diff --git a/ebay_api.py b/ebay_api.py
index fe27100..85b3dae 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -24,7 +24,7 @@ class FindingApi:
         self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
 
         # examples of additional params you may want to add:
-        # 'itemFilter(0).value':'Used'
+        # 'itemFilter(0).value':'Used' -- consider using this with the findCompletedItems call
         # 'itemFilter(1).name':'ListingType'
         # 'itemFilter(1).value':'AuctionWithBIN'
 
@@ -83,10 +83,13 @@ class FindingApi:
 
             for future in executor.map(lambda p: self.get_data(*p), args):
                 data = future
+                '''
+                These try/excepts may be unnecessary.
+                '''
                 try: # TODO if conditionals are not working due to each thread checking the same unedited item_id_results list
                     training = pd.read_csv('training.csv')
                     for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
-                        if (item not in training.values) and (item not in itemid_results_list):
+                        if (item not in training.values) and (item not in itemid_results_list): # might not be required
                             itemid_results_list.append(item['itemId'][0])
 
                 except (pd.errors.EmptyDataError, FileNotFoundError):
@@ -184,7 +187,21 @@ class ShoppingApi:
                 data = json.load(f)
         except (FileNotFoundError, ValueError):
             data = []
-        finding = FindingApi(4, 2) # TODO replace these test values before production
+
+        service_dict = {
+            0: 'findItemsAdvanced', 1: 'findCompletedItems',
+            2: 'findItemsAdvanced', 3: 'findCompletedItems',
+            4: 'findItemsByProduct'}
+
+        fnd_srvc = input(str(service_dict) + " choose Finding call (press enter for default 4): ")
+        pg_num = int(input('how many pages? (76 max): '))
+        if fnd_srvc != '':
+            fnd_srvc = int(fnd_srvc)
+        else:
+            fnd_srvc = 4
+        # TODO replace these test values before production or add option to change prior to execution
+        finding = FindingApi(fnd_srvc, pg_num)
+
         item_id_results = finding.get_ids_from_cats()
         with concurrent.futures.ThreadPoolExecutor() as executor:
             for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
@@ -229,7 +246,12 @@ class CurateData:
         '''
         to_json = json.dumps(raw_data)
         raw_df = pd.read_json(to_json)
-        return raw_df # TODO save csv here?
+        raw_df.to_csv('raw_df.csv', mode='a') # TODO mode='a' re-appends the header row on every run
+        raw_df = pd.read_csv('raw_df.csv', index_col=0)
+        raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # drop dupes after appending new data (the ShoppingApi call might include dupes)
+        raw_df.to_csv('raw_df.csv') # overwrite with the deduped frame; appending again would restore the dupes
+
+        return raw_df
 
     def to_training(self, raw_data): # NOTE need to create copies not views
         '''
@@ -346,9 +368,18 @@ class CurateData:
         with open('temp_pics_source_list.txt', 'w') as f:
             json.dump(temp_pics_source_list, f)
 
-        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8', header=False)
-        # TODO open csv here, drop duplicates and save again unless there's a better way
-        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8', header=False)
+        # Append to the master training dataframes, drop potential dupes and save
+
+        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')
+        expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
+        expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
+        expanded_class.to_csv('expanded_class.csv', encoding='utf-8') # TODO see line 235 about views and copies
+
+        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
+        expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
+        expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
+        expanded_dropd.to_csv('expanded_dropd.csv', encoding='utf-8')
+
         return expanded_class, expanded_dropd
 
     def dl_pictures(self, *args):
@@ -454,3 +485,10 @@ def main():
 
 if __name__ == "__main__":
     main()
+'''
+Based on your sample set of 10 images, if you have an average of 5 images per
+listing and you download a hundred thousand listings, you will have about
+102 GB of image data. That's just for one day. If you have more than a million
+listings you're looking at a little over 1 TB of image data. You don't even
+know if this is good data yet.
+'''
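
Sanity check on the closing storage note: the quoted totals imply an average of roughly 200 KB per image from the 10-image sample; the per-image figure below is back-derived from those totals, not measured.

avg_image_bytes = 204_800          # ~200 KB per image, implied by the quoted totals
images_per_listing = 5
per_day = 100_000 * images_per_listing * avg_image_bytes       # 1.024e11 bytes
print(per_day / 1e9)               # ≈ 102 GB for a hundred thousand listings in a day
print(10 * per_day / 1e12)         # ≈ 1.02 TB for a million listings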
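
On the TODO in FindingApi.get_ids_from_cats about threads checking the same unedited list: executor.map() runs get_data in worker threads but yields results to the loop in the calling thread, so itemid_results_list is only appended from one thread. A minimal sketch of an alternative that skips the per-item membership checks and dedupes once at the end (collect_item_ids, training_ids and the args iterable are illustrative names, not part of ebay_api.py):

import concurrent.futures

def collect_item_ids(finding, args, training_ids=()):
    '''Gather itemIds from all Finding calls, then dedupe once while keeping order.'''
    seen = set(training_ids)  # ids already present in training.csv, if any
    ids = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for data in executor.map(lambda p: finding.get_data(*p), args):
            items = data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']
            for item in items:
                item_id = item['itemId'][0]
                if item_id not in seen:
                    seen.add(item_id)
                    ids.append(item_id)
    return ids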
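
The interactive service_dict/input() block covers the "add option to change prior to execution" TODO; if prompting turns out to be awkward for scheduled or unattended runs, command-line flags are one non-interactive alternative. A sketch using argparse (the flag names are assumptions, not existing options of this repo):

import argparse

def parse_finding_args():
    parser = argparse.ArgumentParser(description='eBay Finding call options')
    parser.add_argument('--service', type=int, default=4, choices=range(0, 5),
                        help='index into service_dict (default 4: findItemsByProduct)')
    parser.add_argument('--pages', type=int, default=2,
                        help='number of result pages to request (76 max)')
    return parser.parse_args()

# usage: args = parse_finding_args(); finding = FindingApi(args.service, args.pages)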
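
The append-read-dedupe-save pattern used for raw_df.csv, expanded_class.csv and expanded_dropd.csv still has one wrinkle: to_csv(mode='a') writes a header row on every append, and that stray row comes back as data on the next read. One way to sidestep it is to merge in memory and write the file once per call; a minimal sketch, using a hypothetical helper rather than anything already in ebay_api.py:

import os
import pandas as pd

def append_dedup_csv(new_df, path, key):
    '''Merge new_df into the CSV at path, drop dupes on key, rewrite the file once.'''
    if os.path.isfile(path):
        combined = pd.concat([pd.read_csv(path, index_col=0), new_df], ignore_index=True)
    else:
        combined = new_df
    combined = combined.drop_duplicates(subset=[key]).reset_index(drop=True)
    combined.to_csv(path)  # single write, so no repeated header rows
    return combined

# e.g. raw_df = append_dedup_csv(raw_df, 'raw_df.csv', 'ItemID')
#      expanded_class = append_dedup_csv(expanded_class, 'expanded_class.csv', 'PictureURL')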