added dict for service and page input options

This commit is contained in:
spbeach46 2021-05-25 18:47:23 -07:00
parent 35cdf8374f
commit 152b72f5df


@@ -24,7 +24,7 @@ class FindingApi:
self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
# examples of additional params you may want to add:
# 'itemFilter(0).value':'Used'
# 'itemFilter(0).value':'Used' consider using this with findCompletedItems call
# 'itemFilter(1).name':'ListingType'
# 'itemFilter(1).value':'AuctionWithBIN'
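# Hedged sketch, not part of this commit: how the commented-out itemFilter
# params above might be combined in a Finding API request dict. The
# 'Condition' filter name and the params dict shape are assumptions for
# illustration; only the values shown in the comments come from this file.
params = {
    'OPERATION-NAME': 'findCompletedItems',
    'itemFilter(0).name': 'Condition',
    'itemFilter(0).value': 'Used',
    'itemFilter(1).name': 'ListingType',
    'itemFilter(1).value': 'AuctionWithBIN',
}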
@@ -83,10 +83,13 @@ class FindingApi:
for future in executor.map(lambda p: self.get_data(*p), args):
data = future
'''
These try/except blocks may be unnecessary.
'''
try: # TODO the if conditional may not work because each thread checks the same unedited itemid_results_list
training = pd.read_csv('training.csv')
for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
if (item not in training.values) and (item not in itemid_results_list):
if (item not in training.values) and (item not in itemid_results_list): # might not be required
itemid_results_list.append(item['itemId'][0])
except (pd.errors.EmptyDataError, FileNotFoundError):
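# Hedged sketch, not part of this commit: one way to address the TODO above
# about every thread checking the same unedited id list. A set guarded by a
# threading.Lock makes the membership test and the append a single atomic
# step. `seen_ids`, `seen_lock`, `collect_new_ids`, and `training_ids` are
# hypothetical names introduced only for illustration.
import threading

seen_ids = set()
seen_lock = threading.Lock()

def collect_new_ids(data, training_ids):
    '''Return item ids that are new to both training.csv and this run.'''
    new_ids = []
    for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
        item_id = item['itemId'][0]
        with seen_lock:  # serialize check-and-add across worker threads
            if item_id not in training_ids and item_id not in seen_ids:
                seen_ids.add(item_id)
                new_ids.append(item_id)
    return new_ids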
@@ -184,7 +187,21 @@ class ShoppingApi:
data = json.load(f)
except (FileNotFoundError, ValueError):
data = []
finding = FindingApi(4, 2) # TODO replace these test values before production
service_dict = {
    0: 'findItemsAdvanced', 1: 'findCompletedItems',
    2: 'findItemsAdvanced', 3: 'findCompletedItems',
    4: 'findItemsByProduct'}
fnd_srvc = input(str(service_dict) + " choose Finding call (press enter for default, 4): ")
fnd_srvc = int(fnd_srvc) if fnd_srvc else 4
pg_num = int(input('how many pages? (76 max) '))
finding = FindingApi(fnd_srvc, pg_num) # TODO replace these test values before production or add option to change prior to execution
item_id_results = finding.get_ids_from_cats()
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
@@ -229,7 +246,12 @@ class CurateData:
'''
to_json = json.dumps(raw_data)
raw_df = pd.read_json(to_json)
return raw_df # TODO save csv here?
raw_df.to_csv('raw_df.csv', mode='a')
raw_df = pd.read_csv('raw_df.csv', index_col=0)
raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # drop dupes after appending new data (ShoppingApi call might include dupes)
raw_df.to_csv('raw_df.csv', mode='w') # overwrite so the deduplicated frame, not the appended original, is what persists
return raw_df
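# Hedged sketch, not part of this commit: the append -> read -> dedupe ->
# save round trip above could be factored into a helper that overwrites the
# file with the deduplicated frame. `append_dedupe_csv` is a hypothetical
# name introduced only for illustration.
import os
import pandas as pd

def append_dedupe_csv(df, path, subset):
    '''Append df to the CSV at path, drop duplicate rows, overwrite, and return the result.'''
    df.to_csv(path, mode='a', header=not os.path.exists(path))
    combined = pd.read_csv(path, index_col=0)
    combined = combined.drop_duplicates(subset=[subset]).reset_index(drop=True)
    combined.to_csv(path, mode='w')  # overwrite so only deduplicated rows persist
    return combined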
def to_training(self, raw_data): # NOTE need to create copies not views
'''
@@ -346,9 +368,18 @@ class CurateData:
with open('temp_pics_source_list.txt', 'w') as f:
json.dump(temp_pics_source_list, f)
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8', header=False)
# TODO open csv here, drop duplicates and save again unless there's a better way
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8', header=False)
# Append to master training dataframes, drop potential dupes and save
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')
expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
expanded_class.to_csv('expanded_class.csv', mode='w', encoding='utf-8') # overwrite with the deduplicated frame. TODO see line 235 about views and copies
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
expanded_dropd.to_csv('expanded_dropd.csv', mode='w', encoding='utf-8')
return expanded_class, expanded_dropd
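# Hedged usage sketch of the hypothetical append_dedupe_csv helper outlined
# earlier; it would replace the repeated append/read/dedupe/overwrite steps
# for both training frames:
# expanded_class = append_dedupe_csv(expanded_class, 'expanded_class.csv', subset='PictureURL')
# expanded_dropd = append_dedupe_csv(expanded_dropd, 'expanded_dropd.csv', subset='PictureURL')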
def dl_pictures(self, *args):
@@ -454,3 +485,10 @@ def main():
if __name__ == "__main__":
main()
'''
Based on your sample set of 10 images, if you have an average of 5 images per
listing and you download a hundred thousand listings, you will have about 102 GB
of image data. That's just for one day. If you have more than a million listings
you're looking at a little over 1 TB of image data. You don't even know if this
is good data yet.
'''
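# Hedged back-of-the-envelope check of the estimate above. The ~200 KB average
# image size is an assumption inferred from the stated figures, not measured here:
#   100,000 listings * 5 images * ~200 KB   ≈ 100 GB  (~102 GB as stated)
#   1,000,000 listings * 5 images * ~200 KB ≈ 1 TB
avg_image_kb = 200  # assumed average image size from the 10-image sample
per_day_gb = 100_000 * 5 * avg_image_kb / 1_000_000
per_million_tb = 1_000_000 * 5 * avg_image_kb / 1_000_000_000
print(f"~{per_day_gb:.0f} GB per 100k listings, ~{per_million_tb:.1f} TB per 1M listings")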