import os
from time import sleep
from random import randint
import scrape_ids
from datetime import datetime, timedelta
import dateutil
from dateutil import parser
import pytz
import pdb
from io import StringIO
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
import config as cfg
import shutil
import re
from ebaysdk.trading import Connection as Trading
from ebaysdk.exception import ConnectionError
from ebaysdk.finding import Connection as Finding
from ebaysdk.shopping import Connection as Shopping


class FindingApi:
    '''
    Methods for accessing eBay's FindingApi services
    '''

    def __init__(self, service, idspc):
        '''
        service: index into the list of Finding operation names below.
        idspc: number of item ids to collect per category.
        '''
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
            ][service]  # Currently using only index 4, i.e., service = 4
        self.idspc = idspc

        # examples of additional params you may want to add:
        # 'itemFilter(0).value':'Used' consider using this with findCompletedItems call
        # 'itemFilter(1).name':'ListingType'
        # 'itemFilter(1).value':'AuctionWithBIN'
        # 'StartTimeNewest'
        # HideDuplicateItems

    def _finding_params(self, category_id, page):
        '''Builds the query parameters for one page of a Finding request.'''
        # NOTE(review): the HideDuplicateItems filter below is un-indexed while the
        # Condition filter uses index 0, and the page key is spelled "PageNumber";
        # verify both against the Finding API URL syntax before relying on them.
        return {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "20",
            "paginationInput.PageNumber": page,
            "itemFilter(0).name": "Condition",
            "itemFilter(0).value": "Used",
            "itemFilter.name": "HideDuplicateItems",
            "itemFilter.value": "true",
            "sortOrder": "StartTimeNewest",
        }
        # "itemFilter.name(2)":"modTimeFrom",
        # "itemFilter.value(2)":modTimeFrom,

    def get_data(self, category_id, idspc):
        '''
        Pages through a FindingApi service call for one category and
        collects itemIDs until at least idspc of them were gathered.

        Returns (ids, data, modTimeFrom, itemSearchURL) when the loop runs
        to completion, where data is the last parsed JSON page. Returns
        only the ids gathered so far (a plain list) when a request fails
        or a response cannot be parsed.
        '''
        # startTime = dateutil.parser.isoparse( startTime )
        # now = datetime.datetime.now(tz=pytz.UTC)
        # days_on_site = (now - startTime).days # as int

        ids = []
        # Pre-set so the final return cannot hit an unbound name when the
        # loop body never runs (e.g. idspc <= 0).
        data = None
        itemSearchURL = None
        modTimeFrom = datetime.now() - timedelta(seconds=5)  # initialize modTimeFrom value
        page = 1
        # The JSON root key is "<operation name>Response".
        root_key = self.service + 'Response'

        while len(ids) < idspc:
            params = self._finding_params(category_id, page)
            try:
                response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                        params=params, timeout=24)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                # TODO need to be able to continue where you left off or use better timeout?
                print('connection error')
                return ids

            try:
                data = response.json()
                payload = data[root_key][0]
                itemSearchURL = payload['itemSearchURL'][0]
                items = payload['searchResult'][0]['item']
                modTimeFrom = items[-1]['listingInfo'][0]['startTime'][0]
                modTimeFrom = dateutil.parser.isoparse(modTimeFrom)
                modTimeFrom = modTimeFrom - timedelta(seconds=5)  # TODO NEED BACK TO GMT FORMAT
                ids.extend(item['itemId'][0] for item in items)
            except (AttributeError, KeyError, IndexError, TypeError, ValueError):
                # IndexError/ValueError cover empty result lists and bodies
                # that are not valid JSON. Print the raw text: calling
                # response.json() again here could raise inside the handler.
                print('AttributeError or KeyError. Exiting')
                print(response.text)
                return ids

            # The interactive "press enter" debugging pause was removed: this
            # method runs inside worker threads, where blocking on stdin
            # stalls the whole pool.
            page += 1

        return ids, data, modTimeFrom, itemSearchURL

    # TODO add some other options to finding call api such as for possibly filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
    # try and come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say for example, if you took a
    # random set of 3 of 8 pictures total from each listing you might have a better chance of getting 3 good pictures in addition to increasing your training set. Or maybe you would have better luck with limiting
    # it to the first 5 pictures instead of random.

    # You may even have more consistency with used shoes since they are "one-off" items without confusing multiple variations and colors. What else you can do is run small training sets on both new and used
    # to see which one is more accurate or if a combo of both is more accurate.

    def get_ids_from_cats(self):
        '''
        Collects item ids for every category in cat_list.txt and groups
        them into comma-joined batches of 20 for the ShoppingApi call.

        Returns (item_id_results, itemid_results_list): the batches, built
        from the de-duplicated ids, and the raw id list as collected (also
        saved to raw_ids.txt).
        '''
        idspc = self.idspc

        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        args = [(cat, idspc) for cat in cat_list]

        itemid_results_list = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for result in executor.map(lambda p: self.get_data(*p), args):
                # get_data returns a 4-tuple on success and a bare id list
                # on failure; only the ids belong in the result list.
                page_ids = result[0] if isinstance(result, tuple) else result
                itemid_results_list.extend(page_ids)

        print(len(itemid_results_list))
        # Order-preserving de-duplication so no ShoppingApi call is spent
        # on an id that was already requested.
        unique_ids = list(dict.fromkeys(itemid_results_list))
        print(len(unique_ids))
        input('press enter to continue')

        with open('raw_ids.txt', 'w') as f:
            json.dump(itemid_results_list, f)

        # 20-ItemID list created to maximize dataset/decrease calls given call constraints
        item_id_results = [','.join(unique_ids[n:n+20]) for n in range(0, len(unique_ids), 20)]
        return item_id_results, itemid_results_list

    # TODO during your try except conditionals just check the csv files. At the end you can create sets. You can create another condition that says if the final set is smaller than 100k then you can call finding
    # service on more pages (but only pages you haven't tried) and repeat the search process.

    # TODO instead of running through multiple try except loops try to implement set methods for efficiency and ease.
# Remember symmetric_difference, difference, intersection, set()
# for category_id in cat_list:


class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes
    '''

    def update_cats(self):
        '''
        Updates cat_list.txt with the child category ids of the women's and
        men's shoe departments. Leaves the file untouched when a request
        fails or a response has an unexpected shape.
        '''
        parent_cats = ['3034', '93427']  # Women's and Men's shoe departments
        cat_list = []

        for department in parent_cats:
            params = {
                "callname": "GetCategoryInfo",
                "X-EBAY-API-IAF-TOKEN": cfg.sec['X-EBAY-API-IAF-TOKEN'],  # TODO change to match format of get_item_from_findItemsByCategory()
                "version": "671",
                "responseencoding": "JSON",
                "CategoryID": department,
                "IncludeSelector": "ChildCategories",
            }

            try:
                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=4)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                # Stop here: there is no usable response to parse, and a
                # partial category list must not overwrite cat_list.txt.
                print('connection error')
                return

            try:
                # excludes index 0 as this is parent node, i.e., women's or men's dept.
                categories = response.json()['CategoryArray']['Category'][1:]
                cat_list.extend(cat['CategoryID'] for cat in categories)
            except (KeyError, TypeError, ValueError):
                print('unexpected GetCategoryInfo response for department {}'.format(department))
                return

        with open('cat_list.txt', 'w') as f:
            json.dump(cat_list, f)

        # leaf_list = [node['LeafCategory'] for node in response]

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given a comma-joined
        string of up to 20 itemIds.

        Returns the response's 'Item' value, or None when the request fails
        or the response carries no 'Item' entry.
        '''
        # ids.txt is only used to report which batch is being processed, so
        # a missing or unreadable file must not abort the download.
        try:
            with open('ids.txt') as f:
                ids = json.load(f)
            # 20-ItemID list created to maximize dataset/decrease calls given call constraints
            item_id_results = [','.join(ids[n:n+20]) for n in range(0, len(ids), 20)]
        except (FileNotFoundError, ValueError, TypeError):
            item_id_results = []
        index = item_id_results.index(twenty_id) if twenty_id in item_id_results else 'unknown'

        headers = {
            "X-EBAY-API-IAF-TOKEN": cfg.sec['X-EBAY-API-IAF-TOKEN'],  # TODO implement auto oauth token renewal
            "version": "671",
        }

        url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID="+twenty_id

        try:
            # random pause between 1 and 10 secs to spread requests out
            sleep(randint(1,10))
            response = requests.get(url, headers=headers, timeout=24)
            response.raise_for_status()
        except requests.exceptions.RequestException:  # TODO need better handling
            print('connection error')
            print('index number {}'.format(index))
            return

        print('index number {}'.format(index))

        try:
            return response.json()['Item']
        except (KeyError, TypeError, ValueError):
            # e.g. an error payload without listings
            print('no Item data in response for index number {}'.format(index))
            return

    def conky(self):
        '''
        Runs get_item_from_findItemsByCategory in multiple threads to get
        relevant data for creating training sets.

        Appends the new listings to whatever raw_data.txt already holds,
        writes the combined list back, and returns it.
        '''
        try:
            with open('raw_data.txt') as f:
                data = json.load(f)
        except (FileNotFoundError, ValueError):
            data = []

        try:
            with open('ids.txt') as f:
                ids = json.load(f)
            # 20-ItemID list created to maximize dataset/decrease calls given call constraints
            item_id_results = [','.join(ids[n:n+20]) for n in range(0, len(ids), 20)]
        except (FileNotFoundError, ValueError):
            item_id_results = scrape_ids.main()

        try:
            # NOTE may need to include sleeps to avoid connection refusal due to overwhelming servers
            with concurrent.futures.ThreadPoolExecutor() as executor:
                for batch in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
                    if not batch:
                        # failed batches come back as None; nothing to add
                        continue
                    # The end result should be a list of dicts where each dict in the list is a listing
                    data.extend(batch)
        finally:
            # Written even when a worker raised, so listings fetched before
            # the failure (e.g. hitting the call limit) are not lost.
            with open('raw_data.txt', 'w') as f:
                json.dump(data, f)

        return data

# each future is a list of dictionaries because the output of any multithreader in this method is a list.
# data dictionary can't update from list of dicts unless iterated over. Might need a different way to update.
# TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.

# NOTE:

# UPDATE** 76 pages seems like it might be too much.
# Limited to 5000 calls to shopping api per day, and getMultipleItems service maxes out at 20 items
# per call leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.


#class SDKTrading:
#    api = Trading(config_file='ebay.yaml')
#    response = api.execute('GetUser', {})
#    print(response.dict())
#    print(response.reply)


class CurateData:
    '''
    Contains methods for curating data for machine learning training sets;
    Takes item in data from ShoppingApi request as argument and extracts/
    creates key value pairs that gets updated to custom dataframe used in
    ML training sets.
    '''

    def import_raw(self):
        '''
        Imports raw response json from local file. This is data from
        GetMultipleItems call in ShoppingApi
        '''
        with open('raw_data.txt') as f:
            return json.load(f)

    def raw_df(self, raw_data):  # TODO not dropping dupes, and is appending raw_data for some reason
        '''
        Creates pandas df from raw json and saves master raw csv file as
        raw_df.csv. Intended to be used inline with direct data stream from
        ebay's APIs
        '''
        to_json = json.dumps(raw_data)
        raw_df = pd.read_json(StringIO(to_json))
        raw_df.to_csv('raw_df.csv')  # NOTE not append mode because raw_df is made from the master raw_data.txt file
        #raw_df = pd.read_csv('raw_df.csv', index_col=0)
        #raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # may not need this
        #raw_df.to_csv('raw_df.csv')

        # TODO still saving "Unnamed:0" column

        return raw_df

    def to_training(self, raw_data):
        '''
        Creates first pass of potential labels for training set. This is the
        base df used to produce other training sets to use.

        Keeps the listing columns needed for training, stores ItemID and
        PrimaryCategoryID as strings and drops listings without
        ItemSpecifics.
        '''
        raw_df = self.raw_df(raw_data)
        # Explicit copy so the conversions below never write into a view of raw_df.
        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID',
                                    'PrimaryCategoryName', 'Title', 'ItemSpecifics']].copy()
        # The misspelled 'PrimaryCAegoryID' target used before left the real
        # column unconverted and added a stray column.
        for id_col in ('ItemID', 'PrimaryCategoryID'):
            interm_df1[id_col] = interm_df1[id_col].astype(str)
        training = interm_df1.dropna(subset=['ItemSpecifics'])
        return training  # TODO RENAME THIS FUNC AND its RETURN VALUE

    def class_training(self, training):
        '''
        Training set for multiclass portion of training set. Used to train
        separately from multilabel portion
        '''
        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
        return class_training

    def nvl_training(self, training):
        '''
        Training set for multilabel portion.

        Flattens each listing's ItemSpecifics NameValueList into one column
        per specific name and puts the listing's PictureURL in front. The
        result keeps the index of training.
        '''
        def flatten(specifics):
            pairs = specifics['NameValueList']
            if isinstance(pairs, dict):
                # tolerate a lone name/value pair that is not wrapped in a list
                pairs = [pairs]
            return {pair['Name']: pair['Value'] for pair in pairs}

        # Necessary for json_normalize(): a plain list of flat dicts
        records = [flatten(specifics) for specifics in training.ItemSpecifics]
        nvl_df = pd.json_normalize(records)
        # json_normalize numbers rows 0..n-1, while training keeps gaps from
        # the earlier dropna; re-using training's index keeps every picture
        # on the row of its own specifics when concatenating.
        nvl_df.index = training.index
        nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)
        return nvl_training

    def extract_df(self, df):
        '''
        Converts lists of strings in any df to a single space-joined string;
        null cells become NaN and all other cells are left as they are.
        '''
        def flatten_cell(cell):
            if isinstance(cell, list):
                return ' '.join(str(part) for part in cell)
            return np.nan if pd.isnull(cell) else cell

        # DataFrame.map replaces the deprecated applymap on newer pandas;
        # looked up on the class so a column named "map" cannot shadow it.
        elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
        extracted_df = elementwise(df, flatten_cell)
        return extracted_df

    def drop_nvl_cols(self, nvl_training):
        '''
        Asks whether to trim nvl_training down to PictureURL plus the
        specifics listed in cat_spacs.txt, minus the names in the two drop
        lists below. Returns nvl_training unchanged when the answer
        contains "n".
        '''
        with open('cat_spacs.txt') as f:
            cat_spacs = json.load(f)

        drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line',
                'Personalize', 'Fabric Type', 'Customized', 'Release Year',
                'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade',
                'Signed', 'Silhouette', 'Insole Material', 'Lining Material',
                'California Prop 65 Warning', 'Character Family', 'Character',
                'Cushioning Level', 'Personalization Instructions', 'Pronation',
                ]
        drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features',
                  'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size',
                  'Country/Region of Manufacture', 'Brand', 'Model']

        unwanted = set(drop) | set(drop_2)
        # dict.fromkeys keeps the file's order and collapses repeated names
        wanted = [spec for spec in dict.fromkeys(cat_spacs) if spec not in unwanted]

        user_input = input('drop cols? (y,n; default=y): ')
        if 'n' in user_input.lower():
            dropd = nvl_training
        else:
            present = set(nvl_training.columns)
            cols = [col for col in wanted if col in present and col != 'PictureURL']
            cols.insert(0, 'PictureURL')  # list of other cols that aren't needed for training
            dropd = nvl_training[cols]
        return dropd

    # for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
    # can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.

    # Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)

    @staticmethod
    def _explode_pictures(df):
        '''One row per image URL, without missing or repeated URLs.'''
        exploded = df.explode('PictureURL').reset_index(drop=True)
        exploded = exploded.dropna(subset=['PictureURL'])
        return exploded.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

    @staticmethod
    def _primary_picture(df):
        '''Copy of df keeping only the first image URL of each listing.'''
        def first(pics):
            if isinstance(pics, list):
                return pics[0] if pics else np.nan
            return pics

        primary = df.copy()  # do not modify the caller's frame
        primary['PictureURL'] = primary['PictureURL'].apply(first)
        return primary.dropna(subset=['PictureURL'])

    def expand_nvlclass(self, class_training, dropd):
        '''
        Takes image url list from each cell and expands them into
        separate/duplicate instances (or keeps only the primary image),
        for both the class training and dropd dfs. Merges the resulting
        URLs into temp_pics_source_list.txt and saves both frames as
        expanded_class.csv / expanded_dropd.csv.

        * consider applying this function to other cells that have multiple
          values in their lists

        Returns (expanded_class, expanded_dropd).
        '''
        expand = input("expand image list or use primary listing image? (y or n): ")
        # ('y' or 'Y') evaluates to 'y', so upper-case answers were never
        # recognised; compare case-insensitively instead.
        if 'y' in expand.lower():
            expanded_class = self._explode_pictures(class_training)
            expanded_dropd = self._explode_pictures(dropd)
        else:
            expanded_class = self._primary_picture(class_training)
            expanded_dropd = self._primary_picture(dropd)

        expanded_dropd = self.extract_df(expanded_dropd)  # convert lists to values

        # TODO because var is del after dl_pictures you may be getting duplicate pictures. ie, expanded_class.PictureURL
        # is a master series and will write temp_pics_source_list as such giving you many repeated pictureURLs (they will
        # not get downloaded due to check @ dl_pic but checking will cont to grow in computate power reqs. So, figure out
        # a way to make a true temp list based on the current call executed
        temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

        try:
            with open('temp_pics_source_list.txt') as f:
                previous = json.load(f)
        except (ValueError, FileNotFoundError):
            previous = []
        temp_pics_source_list = list(set(previous) | set(temp_pics_source_list))
        with open('temp_pics_source_list.txt', 'w') as f:
            json.dump(temp_pics_source_list, f)

        # Append to master training dataframes, drop potential dupes and save

        expanded_class.to_csv('expanded_class.csv')
        # expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
        # expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        # expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies

        expanded_dropd.to_csv('expanded_dropd.csv')
        # expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
        # expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        # expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')

        return expanded_class, expanded_dropd

    def _load_target_dir(self):
        '''Reads the download folder from target_dirs.txt, asking for one when it is missing.'''
        try:
            # TODO you can add option to change directory here, too. Look up how to have optional arguments
            with open('target_dirs.txt') as f:
                return json.load(f)
        except (ValueError, FileNotFoundError):
            pass

        answer = input('No target dirctory found. Create One? [y] or [n]:')
        if answer.strip().lower() == 'y':
            # TODO need to catch human syntax errors here
            target_dir = input('Please provide full URL to destination folder:')
            with open('target_dirs.txt', 'w') as f:
                json.dump(target_dir, f)
        else:
            target_dir = os.getcwd() + os.sep + 'training_images'
            # exist_ok: a folder left over from an earlier run is fine
            os.makedirs(target_dir, exist_ok=True)
            with open('target_dirs.txt', 'w') as f:
                json.dump(target_dir, f)
            print('Creating default folder in current directory @ ' + target_dir)
        return target_dir

    def dl_pictures(self, *args):
        '''
        Downloads pictures from api to local storage and records the
        {source: target} mapping in dict_pics.txt.

        The URLs come from args[0] when given, otherwise from
        temp_pics_source_list.txt. That file is removed once every download
        succeeded. Returns None.
        '''
        # TODO add option to include only first image of each listing as
        # others may be crappy for training. Also consider adding option to
        # reduce the size of each pic downloaded

        target_dir = self._load_target_dir()

        if args:
            temp_pics_source_list = args[0]
        else:
            try:
                with open('temp_pics_source_list.txt') as f:
                    temp_pics_source_list = json.load(f)
            except (ValueError, FileNotFoundError):
                print('url list not found. download aborted')
                return

        # File name = the path segment just before "/$_" or the text before
        # ".jpg"; the dot is escaped so it only matches a literal dot.
        name_pattern = re.compile(r'[^/]+(?=/\$_|\.jpg)', re.IGNORECASE)
        temp_dict_pics = {}
        for url in temp_pics_source_list:
            found = name_pattern.search(url) if isinstance(url, str) else None
            if found is None:
                print('skipping unrecognized picture url: {}'.format(url))
                continue
            temp_dict_pics[url] = target_dir + os.sep + found.group() + '.jpg'

        try:
            with open('dict_pics.txt') as f:
                dict_pics = json.load(f)
        except (ValueError, FileNotFoundError):
            dict_pics = {}
        dict_pics.update(temp_dict_pics)  # TODO This still creates duplicates
        with open('dict_pics.txt', 'w') as f:
            json.dump(dict_pics, f)

        def dl_pic(dict_pics, pic):
            '''Downloads one picture unless its target exists; True when the file is in place.'''
            target = dict_pics[pic]
            if os.path.exists(target):
                return True
            try:
                with requests.get(pic, stream=True, timeout=24) as r:
                    # never save an error page as an image
                    r.raise_for_status()
                    r.raw.decode_content = True
                    with open(target, 'wb') as f:
                        shutil.copyfileobj(r.raw, f)
            except requests.exceptions.RequestException:
                print('connection error')
                return False
            return True

        bargs = [(dict_pics, pic) for pic in temp_dict_pics]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            outcomes = list(executor.map(lambda p: dl_pic(*p), bargs))

        # Keep the list for a retry when a download failed; otherwise it is done with.
        if all(outcomes) and os.path.exists('temp_pics_source_list.txt'):
            os.remove('temp_pics_source_list.txt')  # Deletes file after downloads complete successfully


class PreProcessing:
    '''
    Includes methods for pre-processing training set input and labels in the
    training set created from CurateData class. Whereas CurateData training
    sets provided trimmed down data from the raw json response from the
    ShoppingApi call and provided a bare minimum format for the dataframe to be
    used in training, PreProcessing optimizes that dataframe for training and
    includes methods for image manipulation, creating test/train/validation
    splits, etc.
    '''

    def stt_training(self, dict_pics, expanded_class, expanded_dropd):
        '''
        Source to target training. Replaces source image URL with target URL
        determined by values in dict_pics variable. Not implemented yet.
        '''
        pass

# TODO pipeline gameplan: 5 files: dict_pics.txt, raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv
# cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
# if not exists and append to master img download dict
# --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted


def main():
    '''
    Main program creates/updates a csv file to use for ML training from live
    ebay listings. Not implemented yet.
    '''
    pass
    # main goes here:


if __name__ == "__main__":
    main()

'''
Based on your sample set of 10 images, if you have an average of 5 images per
listing and you download a hundred listings, you will have about 102 Gb of
image data. That's just for one day. If you have more than a million listings
you're looking at a little over 1Tb of image data. You don't even know if this
is good data yet.
'''