import os
from time import sleep
from random import randint
import scrape_ids
from datetime import datetime, timedelta
import dateutil
from dateutil import parser
import pytz
import pdb
from io import StringIO
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
import config as cfg
import shutil
import re
from ebaysdk.exception import ConnectionError
from ebaysdk.trading import Connection as Trading
from ebaysdk.finding import Connection as Finding
from ebaysdk.shopping import Connection as Shopping


class FindingApi:
    '''
    Methods for accessing eBay's FindingApi services
    '''

    def __init__(self, service):
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
        ][service]  # Currently using only index 4, i.e., service=4

        # Examples of additional params you may want to add:
        # 'itemFilter(0).value':'Used' -- consider using this with a findCompletedItems call
        # 'itemFilter(1).name':'ListingType'
        # 'itemFilter(1).value':'AuctionWithBIN'
        # 'StartTimeNewest'
        # 'HideDuplicateItems'

    def get_data(self, category_id):
        '''
        Gets raw JSON data from a FindingApi service call. Currently used to
        get itemIDs from categories.
        '''
        # startTime = dateutil.parser.isoparse(startTime)
        # now = datetime.datetime.now(tz=pytz.UTC)
        # days_on_site = (now - startTime).days  # as int

        ids = []
        params = {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "100",
            "paginationInput.PageNumber": "1",
            "itemFilter(0).name": "Condition",
            "itemFilter(0).value": "Used",
            "itemFilter.name": "HideDuplicateItems",
            "itemFilter.value": "true",
            "sortOrder": "StartTimeNewest",
        }
        # "itemFilter(1).name": "TopRatedSellerOnly",  # TODO fix here
        # "itemFilter(1).value": "true"

        try:
            response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                    params=params, timeout=24)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # TODO this works, but ideally resume where you left off or use a better timeout
            print('connection error')
            return ids

        try:
            data = response.json()
            for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                ids.append(item['itemId'][0])
            ids = list(set(ids))
        except (AttributeError, KeyError):
            print('AttributeError or KeyError. Exiting')
            print(response.json())
            return ids

        return ids

    # TODO Add other options to the Finding call, e.g. filtering for used items only;
    # that might give a better training dataset, or maybe a mixture of new and used.
    # Try to come up with a way to mathematically maximize the number of useful pictures
    # in the training set while reducing the number of useless ones. For example, taking
    # a random 3 of the up-to-8 pictures per listing might give a better chance of getting
    # 3 good pictures while also growing the training set; alternatively, limit it to the
    # first 5 pictures instead of a random sample (see the sketch below).
    # Used shoes may also be more consistent since they are "one-off" items without
    # confusing multiple variations and colors. You can also run small training sets on
    # new, used, and combined data to see which is most accurate.
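
    # A rough sketch of the sampling idea above (hypothetical helper, not wired into
    # the pipeline; 'picture_urls' would be the PictureURL list of a single listing):
    #
    #     from random import sample
    #
    #     def sample_pics(picture_urls, k=3, strategy='random'):
    #         if strategy == 'random':
    #             return sample(picture_urls, min(k, len(picture_urls)))
    #         return picture_urls[:5]  # the "first five" alternative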

    def get_ids_from_cats(self):
        '''
        Creates comma-separated strings of 20 itemIds at a time to use for the
        ShoppingApi call
        '''
        itemid_results_list = []

        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_data, cat_list):
                itemid_results_list.extend(future)

        print(len(itemid_results_list))
        a = list(set(itemid_results_list))  # deduped count, printed for comparison
        print(len(a))

        with open('raw_ids.txt', 'w') as f:
            json.dump(itemid_results_list, f)

        # 20-itemID strings created to maximize dataset/decrease calls given call constraints
        item_id_results = [','.join(itemid_results_list[n:n + 20]) for n in
                           range(0, len(itemid_results_list), 20)]

        return item_id_results, itemid_results_list


class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes
    '''

    def update_cats(self):
        '''
        Updates cat_list.txt
        '''
        parent_cats = ['3034', '93427']  # Women's and Men's shoe departments
        cat_list = []

        for department in parent_cats:

            headers = {
                "X-EBAY-API-IAF-TOKEN": cfg.sec['X-EBAY-API-IAF-TOKEN'],  # TODO implement auto oauth token renewal
                "version": "671",
            }

            url = "https://open.api.ebay.com/shopping?&callname=GetCategoryInfo&responseencoding=JSON&IncludeSelector=ChildCategories&CategoryID=" + department

            try:
                response = requests.get(url, headers=headers, timeout=4)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                print('connection error')
                continue  # skip this department instead of parsing a failed response

            response = response.json()
            response = response['CategoryArray']['Category'][1:]  # excludes index 0 since it is the parent node, i.e., women's or men's dept.

            temp_cat_list = [cat['CategoryID'] for cat in response]
            cat_list.extend(temp_cat_list)

        with open('cat_list.txt', 'w') as f:
            json.dump(cat_list, f)

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds
        '''
        headers = {
            "X-EBAY-API-IAF-TOKEN": cfg.sec['X-EBAY-API-IAF-TOKEN'],  # TODO implement auto oauth token renewal
            "version": "671",
        }
        url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID=" + twenty_id

        try:
            # sleep(randint(1, 10))  # random sleep between calls; may not be necessary
            response = requests.get(url, headers=headers, timeout=24)
            response.raise_for_status()
            response = response.json()
            item = response['Item']

        except (requests.exceptions.RequestException, KeyError) as err:
            print('connection error. IP limit possibly exceeded')
            print(err)
            return  # returns NoneType. Handled at conky()

        return item

    def conky(self, twenty_ids_list):
        '''
        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
        data for creating training sets
        '''
        try:
            with open('raw_data.txt') as f:
                data = json.load(f)
        except (FileNotFoundError, ValueError):
            data = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_item_from_findItemsByCategory, twenty_ids_list):
                if future is not None:
                    for item in future:
                        data.append(item)  # the end result is a list of dicts where each dict in the list is a listing
                else:
                    print('response is None')
                    break

        with open('raw_data.txt', 'w') as f:
            json.dump(data, f)

        return data

# NOTE:
# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes
# out at 20 items per call, leaving you 100,000 items per day for your pandas dataframe
# initially. You'll have to divide these up into the categories, which leaves you about
# 6.25K results per cat. More than enough data for your dataset.
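
# Back-of-the-envelope check of the note above (my arithmetic, assuming the stated
# limits are current):
#   5,000 calls/day * 20 items/call = 100,000 items/day
#   100,000 items/day / 6,250 items per cat ~= 16 child categories covered per day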


class CurateData:
    '''
    Contains methods for curating data for machine learning training sets.
    Takes item data from the ShoppingApi request as argument and extracts/creates
    key-value pairs that get updated to the custom dataframe used in ML training sets.
    '''

    def import_raw(self):
        '''
        Imports raw response json from local file. This is data from the
        GetMultipleItems call in ShoppingApi
        '''
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
        return raw_data

    def raw_df(self, raw_data):  # TODO not dropping dupes, and is appending raw_data for some reason
        '''
        Creates a pandas df from the raw json and saves the master raw csv file as raw_df.csv.
        Intended to be used inline with a direct data stream from eBay's APIs
        '''
        to_json = json.dumps(raw_data)
        raw_df = pd.read_json(StringIO(to_json))
        raw_df.to_csv('raw_df.csv')  # NOTE not append mode because raw_df is made from the master raw_data.txt file
        # raw_df = pd.read_csv('raw_df.csv', index_col=0)
        # raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)  # may not need this
        # raw_df.to_csv('raw_df.csv')
        # TODO still saving "Unnamed: 0" column
        return raw_df

    def to_training(self, raw_data):
        '''
        Creates the first pass of potential labels for the training set. This is the base
        df used to produce the other training sets.
        '''
        raw_df = self.raw_df(raw_data)
        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
        training = interm_df1.dropna(subset=['ItemSpecifics'])
        return training  # TODO RENAME THIS FUNC AND ITS RETURN VALUE

    def class_training(self, training):
        '''
        Training set for the multiclass portion of the training set. Used to train
        separately from the multilabel portion
        '''
        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
        return class_training

    def nvl_training(self, training):
        '''
        Training set for the multilabel portion
        '''
        interm_df1 = pd.Series(training.ItemSpecifics)
        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
        # Necessary for json_normalize():
        nvl_dict = interm_df1.apply(lambda x: {k: v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])})
        nvl_df = pd.json_normalize(nvl_dict)
        nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)
        return nvl_training
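
    # Illustrative shape of the NameValueList transformation above (example values are
    # made up; 'Value' is shown as a list because extract_df() later joins lists to strings):
    #
    #   ItemSpecifics = {'NameValueList': [{'Name': 'Brand', 'Value': ['Nike']},
    #                                      {'Name': 'Style', 'Value': ['Running']}]}
    #   --> nvl_dict entry: {'Brand': ['Nike'], 'Style': ['Running']}
    #   --> pd.json_normalize() then turns each key into its own column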

    def extract_df(self, df):
        '''
        Converts single-value lists of strings in any df to strings if not null
        '''
        extracted_df = df.applymap(lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isnull(x) else x)
        return extracted_df

    def drop_nvl_cols(self, nvl_training):
        with open('cat_spacs.txt') as f:
            cat_spacs = json.load(f)

        drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line',
                'Personalize', 'Fabric Type', 'Customized', 'Release Year',
                'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade',
                'Signed', 'Silhouette', 'Insole Material', 'Lining Material',
                'California Prop 65 Warning', 'Character Family', 'Character',
                'Cushioning Level', 'Personalization Instructions', 'Pronation',
                ]
        drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features',
                  'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size',
                  'Country/Region of Manufacture', 'Brand', 'Model']

        for cat in drop:
            if cat in cat_spacs:
                cat_spacs.remove(cat)
        for cat in drop_2:
            if cat in cat_spacs:
                cat_spacs.remove(cat)

        user_input = input('drop cols? (y,n; default=y): ')
        if 'n' in user_input:
            dropd = nvl_training  # .drop(col_drop, errors='ignore', axis=1)  # errors='ignore' for non-existent labels
        else:
            cols = []
            for col in cat_spacs:
                if col in list(nvl_training.columns):
                    cols.append(col)
            cols.insert(0, 'PictureURL')  # PictureURL is kept for downstream use even though it isn't a training label
            dropd = nvl_training[cols]
        return dropd

    # For future reference: to deal with inconsistent values in the nvl (due to sellers
    # entering custom values in the fields) you can drop either listings or k/v pairs that
    # are unique, i.e. compute the frequency of each unique k/v pair and drop those with a
    # frequency of 1 (see the sketch below).
    # Also check the list of kept cols above for duplicates with different spelling or
    # phrasing (e.g., Departement and Department, or Fastening and Closure Type).
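
    # A possible pandas sketch of the frequency idea above (illustrative only;
    # 'nvl' stands for the dataframe returned by nvl_training()):
    #
    #     long = nvl.drop(columns=['PictureURL']).melt()   # (variable, value) pairs
    #     freq = long.groupby(['variable', 'value']).size()
    #     singletons = freq[freq == 1]                     # k/v pairs occurring exactly once
    #     # ...then drop the listings (rows) or columns dominated by singleton values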

    def expand_nvlclass(self, class_training, dropd):
        '''
        Takes the image url list from each cell and expands the urls into separate/duplicate
        instances. Modifies both the class_training and dropd dfs. Appends to the custom
        image url dict {'source': 'target'}.
        *consider applying this function to other cells that have multiple values in their lists
        '''
        expand = input("expand image list or use primary listing image? (y or n): ")
        if 'y' in expand.lower():
            expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
            expanded_class = expanded_class.dropna(subset=['PictureURL'])
            expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

            expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
            expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
            expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

            expanded_dropd = self.extract_df(expanded_dropd)  # convert lists to values

            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

        else:
            class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
            expanded_class = class_training.dropna()
            dropd = dropd.dropna(subset=['PictureURL'])
            dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
            dropd = dropd.dropna(subset=['PictureURL'])
            expanded_dropd = dropd

            expanded_dropd = self.extract_df(expanded_dropd)  # convert lists to values

            # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt
            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

        try:
            with open('temp_pics_source_list.txt') as f:
                tpsl = json.load(f)
            tpsl.extend(temp_pics_source_list)
            temp_pics_source_list = list(set(tpsl))
            with open('temp_pics_source_list.txt', 'w') as f:
                json.dump(temp_pics_source_list, f)
        except (ValueError, FileNotFoundError):
            with open('temp_pics_source_list.txt', 'w') as f:
                json.dump(temp_pics_source_list, f)

        # Append to master training dataframes, drop potential dupes and save
        expanded_class.to_csv('expanded_class.csv')
        expanded_dropd.to_csv('expanded_dropd.csv')

        return expanded_class, expanded_dropd

    def dl_pictures(self):
        '''
        Downloads pictures from the api to local storage using temp_pics_source_list
        and creates a custom {source: target} dictionary as dict_pics
        '''
        try:
            with open('target_dirs.txt', 'r+') as f:  # TODO add an option to change the directory here, too (optional argument)
                target_dir = json.load(f)
        except (ValueError, FileNotFoundError):
            target_dir = input('No target directory found. Create one? [y] or [n]: ')
            if target_dir.lower() == 'y':
                target_dir = input('Please provide the full path to the destination folder: ')  # TODO need to catch human syntax errors here
                with open('target_dirs.txt', 'w') as f:
                    json.dump(target_dir, f)
            else:
                os.mkdir(os.getcwd() + os.sep + 'training_images')
                target_dir = os.getcwd() + os.sep + 'training_images'
                with open('target_dirs.txt', 'w') as f:
                    json.dump(target_dir, f)
                print('Creating default folder in current directory @ ' + target_dir)

        with open('temp_pics_source_list.txt') as f:
            try:
                temp_pics_source_list = json.load(f)
            except (ValueError, FileNotFoundError):
                print('url list not found. download aborted')
                return

        dict_pics = {k: target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}

        with open('dict_pics.txt', 'w') as f:
            json.dump(dict_pics, f)

        def dl_pic(dict_pics, pic):
            try:
                if os.path.exists(dict_pics[pic]):
                    pass  # TODO should catch dupes; make sure it does
                else:
                    try:
                        r = requests.get(pic, stream=True)
                        r.raw.decode_content = True
                        with open(dict_pics[pic], 'wb') as f:
                            shutil.copyfileobj(r.raw, f)
                    except ConnectionError:
                        return  # None signals a failed download to the loop below
            except KeyError:
                pass
            return True

        bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(lambda p: dl_pic(*p), bargs):
                if future is None:
                    print('connection error')

        os.remove('temp_pics_source_list.txt')  # deletes the file after downloads complete successfully


class PreProcessing:
    '''
    Includes methods for pre-processing training set inputs and labels in the
    training set created from the CurateData class. Whereas CurateData training
    sets provide trimmed-down data from the raw json response of the ShoppingApi
    call and a bare-minimum dataframe format to be used in training, PreProcessing
    optimizes that dataframe for training and includes methods for image
    manipulation, creating test/train/validation splits, etc.
    '''

    def dict_pics(self):
        '''
        Source-to-target training. Replaces source image URLs with target URLs
        determined by the values in the dict_pics variable.
        '''
        target_dir = os.getcwd()
        with open('temp_pics_source_list.txt') as f:
            temp_pics_source_list = json.load(f)

        dict_pics = {k: target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
        print("{source: target} dictionary created @ " + os.getcwd() + os.sep + 'training_images')

        return dict_pics

# TODO pipeline gameplan -- 5 files: dict_pics.txt, raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv
#   1. open raw_json.txt and append new data; same with the csv
#   2. process the new data, pull out image source+dest, and expand the new dfs for the additional pictures
#   3. if not already present, append to the master img download dict
#   4. concat the m_class_training and m_nvl_training dfs with the new data. Need to add
#      inclusion tests for all files when opened and appended/concatted


def main():
    '''
    Main program creates/updates a csv file to use for ML training from live
    eBay listings
    '''
    pass
    # main goes here:
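    # A minimal sketch (illustrative only, not validated end-to-end) of how the
    # classes above could be chained; variable names here are placeholders:
    #
    #     shopping = ShoppingApi()
    #     shopping.update_cats()                  # refresh cat_list.txt first
    #
    #     finding = FindingApi(4)                 # index 4 = 'findItemsByCategory'
    #     twenty_id_strings, _ = finding.get_ids_from_cats()
    #
    #     raw_data = shopping.conky(twenty_id_strings)
    #
    #     curate = CurateData()
    #     training = curate.to_training(raw_data)
    #     dropd = curate.drop_nvl_cols(curate.nvl_training(training))
    #     expanded_class, expanded_dropd = curate.expand_nvlclass(
    #         curate.class_training(training), dropd)
    #     curate.dl_pictures()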


if __name__ == "__main__":
    main()

'''
Based on your sample set of 10 images, if you have an average of 5 images per
listing and you download a hundred thousand listings, you will have about 102 GB
of image data. That's just for one day. If you have more than a million listings
you're looking at a little over 1 TB of image data. You don't even know if this
is good data yet.
'''
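
# Implied by the figures above (my arithmetic, reading "a hundred thousand listings"
# as the daily 100,000-item limit): 100,000 listings * 5 images/listing = 500,000
# images, so ~102 GB/day works out to roughly 200 KB per image on average.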