From 35cdf8374fd7e7c174a51ab86b79f2539a35b959 Mon Sep 17 00:00:00 2001
From: spbeach46
Date: Tue, 11 May 2021 11:11:59 -0700
Subject: [PATCH] Add CSV file save and appending functionality where needed

---
 ebay_api.py | 132 ++++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 61 deletions(-)

diff --git a/ebay_api.py b/ebay_api.py
index 15665ae..fe27100 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -20,7 +20,7 @@ class FindingApi:
             'findItemsAdvanced', 'findCompletedItems',
             'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
             'findItemsByProduct'
-            ][service]
+            ][service] # Currently using only index 4, i.e., service = 4
         self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
         # examples of additional params you may want to add:
         # 'itemFilter(0).value':'Used'
@@ -28,45 +28,11 @@ class FindingApi:
         # 'itemFilter(1).name':'ListingType'
         # 'itemFilter(1).value':'AuctionWithBIN'

-    def update_cats(self):
-
-        parent_cats = ['3034', '93427']
-        cat_list = []
-
-        for department in parent_cats:
-
-            params = {
-                "callname":"GetCategoryInfo",
-                "appid":cfg.sec['SECURITY-APPNAME'],
-                "version":"671",
-                "responseencoding":"JSON",
-                "CategoryID":department,
-                "IncludeSelector":"ChildCategories",
-            }
-
-            try:
-                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
-                response.raise_for_status()
-
-            except requests.exceptions.RequestException:
-                print('connection error')
-
-            response = response.json()
-            response = response['CategoryArray']['Category'][1:]
-            temp_cat_list = [cat['CategoryID'] for cat in response]
-            cat_list.extend(temp_cat_list)
-
-        with open('cat_list.txt', 'w') as f:
-            json.dump(cat_list, f)
-
-        # leaf_list = [node['LeafCategory'] for node in response]
-        return cat_list
-
     def get_data(self, category_id, i):

         '''
-        Gets raw JSON data fom FindingApi service call
-        Currently being used to get itemIDs from categories
+        Gets raw JSON data from the FindingApi service call. Currently being
+        used to get itemIDs from categories.
         '''

         params = {
@@ -144,6 +110,45 @@ class ShoppingApi:
     Creates objects from ShoppingApi service calls that can interact with
     pandas dataframes
     '''
+
+    def update_cats(self):
+        '''
+        Updates cat_list.txt
+        '''
+
+        parent_cats = ['3034', '93427'] # Women's and Men's shoe departments
+        cat_list = []
+
+        for department in parent_cats:
+
+            params = {
+                "callname":"GetCategoryInfo",
+                "appid":cfg.sec['SECURITY-APPNAME'],
+                "version":"671",
+                "responseencoding":"JSON",
+                "CategoryID":department,
+                "IncludeSelector":"ChildCategories",
+            }
+
+            try:
+                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
+                response.raise_for_status()
+
+            except requests.exceptions.RequestException:
+                print('connection error')
+
+            response = response.json()
+            response = response['CategoryArray']['Category'][1:] # excludes index
+            # 0 as this is the parent node, i.e., women's or men's dept.
+
+            temp_cat_list = [cat['CategoryID'] for cat in response]
+            cat_list.extend(temp_cat_list)
+
+        with open('cat_list.txt', 'w') as f:
+            json.dump(cat_list, f)
+
+        # leaf_list = [node['LeafCategory'] for node in response]
+
     def get_item_from_findItemsByCategory(self, twenty_id):
         '''
         Gets raw JSON data from multiple live listings given multiple itemIds
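# NOTE (sketch, not part of the patch): in update_cats above, the except branch
# only prints 'connection error' and execution falls through to response.json(),
# so a failed request can still break the loop or reuse a stale response. A
# minimal sketch of one way to guard the parse step; the helper name, retry
# count, and timeout value are illustrative assumptions, not code from this repo.
import json
import requests

def get_child_category_ids(appid, department, retries=3):
    '''Return child CategoryIDs for one parent department, or [] on failure.'''
    params = {
        "callname": "GetCategoryInfo",
        "appid": appid,
        "version": "671",
        "responseencoding": "JSON",
        "CategoryID": department,
        "IncludeSelector": "ChildCategories",
    }
    for _ in range(retries):
        try:
            response = requests.get("https://open.api.ebay.com/shopping?",
                                    params=params, timeout=4)
            response.raise_for_status()
            categories = response.json()['CategoryArray']['Category']
            return [cat['CategoryID'] for cat in categories[1:]]  # index 0 is the parent node
        except (requests.exceptions.RequestException, KeyError, ValueError):
            continue
    return []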
@@ -157,7 +162,6 @@ class ShoppingApi:
             "IncludeSelector":"ItemSpecifics",
         }

-        # TODO Add try excepts here
         try:
             response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
             response.raise_for_status()
@@ -175,21 +179,33 @@ class ShoppingApi:
         Runs get_item_from_findItemsByCategory in multiple threads to get relevant data for
         creating training sets
         '''
-        data = []
+        try:
+            with open('raw_data.txt') as f:
+                data = json.load(f)
+        except (FileNotFoundError, ValueError):
+            data = []

         finding = FindingApi(4, 2) # TODO replace these test values before production
         item_id_results = finding.get_ids_from_cats()

         with concurrent.futures.ThreadPoolExecutor() as executor:
             for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
-                # print(future)
                 for item in future:
                     data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
                     # data.update(future)
-        # TODO save data here. You'll use this with your curate data class. SAve this as text file
+        with open('raw_data.txt', 'w') as f:
+            json.dump(data, f)
         return data # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list.
         # data dictionary can't update from list of dicts unless iterated over. Might need a different way to update.
         # TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.
+# NOTE:
+
+# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
+# per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
+# to divide these up into the categories. This will leave you with about 6.25K results per cat.
+# More than enough data for your dataset.
+
+
 class CurateData:
     '''
     Contains methods for curating data for machine learning training sets;
@@ -199,7 +215,8 @@ class CurateData:
     def import_raw(self):

         '''
-        imports raw response json from local file
+        Imports raw response JSON from a local file. This is data from the
+        GetMultipleItems call in ShoppingApi.
         '''
         with open('raw_data.txt') as f:
             raw_data = json.load(f)
@@ -212,7 +229,7 @@ class CurateData:
         '''
         to_json = json.dumps(raw_data)
         raw_df = pd.read_json(to_json)
-        return raw_df
+        return raw_df # TODO save csv here?

     def to_training(self, raw_data): # NOTE need to create copies not views
         '''
@@ -293,20 +310,24 @@ class CurateData:
         dropd = nvl_training.drop(col_drop, axis=1)
         return dropd
+# For future reference: to deal with inconsistent values in the NVL (due to sellers entering custom values in the fields), you can drop either listings or k/v pairs that are unique.
+# Uniqueness can be determined by computing the frequency of each k/v pair and dropping those with a frequency of 1.
+
+# Check the above list of cols I want to keep to see if there are duplicates with different spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)
+

     def expand_nvlclass(self, class_training, dropd):
         '''
         takes image url list from each cell and expands them into separate/duplicate
         instances. Modifies both class training and dropd dfs. Appends custom
         image url dict {'source':'target'}.
         '''

-        expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object
+        expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
         expanded_class = expanded_class.dropna(subset=['PictureURL'])
         expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
-        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop duplicates here or before instantiating curate object
+
+        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
         expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
         expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])

         expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
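# NOTE (sketch, not part of the patch): the 'For future reference' comment in the
# hunk above proposes dropping k/v pairs that occur only once across listings. A
# minimal pandas sketch of that frequency test; the function name and the
# NaN-masking approach are illustrative assumptions, not code from this repo.
import pandas as pd

def mask_singleton_values(df, cols):
    '''Replace values that occur only once in a column with NaN.'''
    out = df.copy()
    for col in cols:
        counts = out[col].value_counts()        # frequency of each value in the column
        singletons = counts[counts == 1].index  # values seen exactly once
        out[col] = out[col].where(~out[col].isin(singletons))
    return out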
@@ -325,7 +346,9 @@ class CurateData:
         with open('temp_pics_source_list.txt', 'w') as f:
             json.dump(temp_pics_source_list, f)

-        # TODO still need to save these as csv files
+        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8', header=False)
+        # TODO open csv here, drop duplicates and save again unless there's a better way
+        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8', header=False)
         return expanded_class, expanded_dropd

     def dl_pictures(self, *args):
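# NOTE (sketch, not part of the patch): the to_csv calls above append on every run,
# and the TODO asks about dropping duplicates afterwards. A minimal sketch of one
# way to do that in a single step; append_deduped is an illustrative helper name,
# and the 'PictureURL' default mirrors the drop_duplicates subset used in
# expand_nvlclass rather than a confirmed requirement.
import os
import pandas as pd

def append_deduped(df, path, subset='PictureURL'):
    '''Combine new rows with any existing CSV, drop duplicates, and rewrite the file.'''
    if os.path.isfile(path):
        existing = pd.read_csv(path, encoding='utf-8')
        df = pd.concat([existing, df], ignore_index=True)
    df = df.drop_duplicates(subset=[subset]).reset_index(drop=True)
    df.to_csv(path, encoding='utf-8', index=False)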
@@ -431,16 +454,3 @@ def main():
 if __name__ == "__main__":
     main()

-# Limited to 5000 calls to shopping api per day, and getMultpileitems service maxes out at 20 items
-# per call leaving you 100,000 items per day for you pandas dataframe initially. So you'll have
-# to divide these up into the categories. This will leave you with about 6.25K results per cat.
-# More than enough data for your dataset.
-
-# for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
-# can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.
-
-# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
-# TO AVOID HICCUPS WHEN CREATING DATASET
-# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF Shoe TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
-
-# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)
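# NOTE (sketch, not part of the patch): the removed comment above (now kept inside
# CurateData) asks about columns duplicated under different spellings, e.g.
# Departement vs. Department. A minimal pandas sketch of folding such pairs into a
# single column; merge_aliased_columns and the example alias pairs are illustrative
# assumptions, not a confirmed mapping from this repo.
import pandas as pd

def merge_aliased_columns(df, aliases):
    '''Fold each alias column into its canonical column, filling gaps, then drop the alias.'''
    out = df.copy()
    for canonical, alias in aliases.items():
        if alias in out.columns and canonical in out.columns:
            out[canonical] = out[canonical].fillna(out[alias])
            out = out.drop(columns=[alias])
        elif alias in out.columns:
            out = out.rename(columns={alias: canonical})
    return out

# Hypothetical usage:
# dropd = merge_aliased_columns(dropd, {'Department': 'Departement', 'Closure': 'Fastening'})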