added dict for service and page input options
parent 35cdf8374f
commit 152b72f5df

1 changed file: ebay_api.py (52 lines changed)
@@ -24,7 +24,7 @@ class FindingApi:
         self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints

         # examples of additional params you may want to add:
-        # 'itemFilter(0).value':'Used'
+        # 'itemFilter(0).value':'Used' consider using this with findCompletedItems call
         # 'itemFilter(1).name':'ListingType'
         # 'itemFilter(1).value':'AuctionWithBIN'

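Note: the commented-out filters follow the Finding API's flattened itemFilter(n).name / itemFilter(n).value request-parameter convention. A rough sketch of how the request params might look with those filters switched on; the app ID, category, and the 'Condition' filter name are illustrative placeholders, not values taken from this commit:

# Illustrative only: SECURITY-APPNAME and categoryId are placeholders.
params = {
    'OPERATION-NAME': 'findCompletedItems',
    'SERVICE-VERSION': '1.13.0',
    'SECURITY-APPNAME': 'YOUR-APP-ID',
    'RESPONSE-DATA-FORMAT': 'JSON',
    'categoryId': '11450',
    'paginationInput.entriesPerPage': '100',
    'paginationInput.pageNumber': '1',
    'itemFilter(0).name': 'Condition',
    'itemFilter(0).value': 'Used',
    'itemFilter(1).name': 'ListingType',
    'itemFilter(1).value': 'AuctionWithBIN',
}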
@@ -83,10 +83,13 @@ class FindingApi:
             for future in executor.map(lambda p: self.get_data(*p), args):
                 data = future

+                '''
+                These try excepts may be unnecessary.
+                '''
                 try: # TODO if conditionals are not working due to each thread checking the same unedited item_id_results list
                     training = pd.read_csv('training.csv')
                     for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
-                        if (item not in training.values) and (item not in itemid_results_list):
+                        if (item not in training.values) and (item not in itemid_results_list): # might not be required
                             itemid_results_list.append(item['itemId'][0])

                 except (pd.errors.EmptyDataError, FileNotFoundError):
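Note on the TODO in this hunk: because every worker filters against the same itemid_results_list while other workers are still appending to it, duplicates can still slip through, and `item not in training.values` compares a whole response dict against the CSV's cell values. One alternative, sketched on the assumption that training.csv has an ItemID column and that os and pandas are imported at module level, is to collect the ids from all futures first and filter once at the end:

# Sketch of a post-hoc dedupe inside get_ids_from_cats (assumes an
# 'ItemID' column in training.csv; adjust if the real header differs).
itemid_results_list = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for data in executor.map(lambda p: self.get_data(*p), args):
        try:
            items = data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']
        except (KeyError, IndexError, TypeError):
            continue
        itemid_results_list.extend(item['itemId'][0] for item in items)

if os.path.isfile('training.csv'):
    seen = set(pd.read_csv('training.csv')['ItemID'].astype(str))
    itemid_results_list = [i for i in itemid_results_list if i not in seen]
itemid_results_list = list(dict.fromkeys(itemid_results_list))  # drop in-batch dupes, keep order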
@@ -184,7 +187,21 @@ class ShoppingApi:
                 data = json.load(f)
         except (FileNotFoundError, ValueError):
             data = []
-        finding = FindingApi(4, 2) # TODO replace these test values before production
+
+        service_dict = {
+            0: 'findItemsAdvanced', 1: 'findCompletedItems',
+            2: 'findItemsAdvanced', 3: 'findCompletedItems',
+            4: 'findItemsByProduct'}
+        service_dict
+
+        fnd_srvc = int(input(str(service_dict) + "choose Finding call: ('press enter' for default(4))"))
+        pg_num = int(input('how many pages? (76 max)'))
+        if fnd_srvc != '':
+            finding = FindingApi(fnd_srvc, pg_num) # TODO replace these test values before production or add option to change prior to execution
+        else:
+            fnd_srvc = 4
+            finding = FindingApi(fnd_srvc, pg_num)
+
         item_id_results = finding.get_ids_from_cats()
         with concurrent.futures.ThreadPoolExecutor() as executor:
             for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
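Note on the new prompt: because the input is passed through int() before the `!= ''` check, pressing enter raises ValueError instead of falling back to the default, and comparing an int against '' is always true. A small sketch of the intended "enter for default" behaviour; service_dict, fnd_srvc, pg_num and FindingApi are the names used in the diff, and the defaults of 4 and 2 simply mirror the old FindingApi(4, 2) test values:

# Sketch of the intended default handling; defaults 4 and 2 are assumptions.
raw_srvc = input(str(service_dict) + " choose Finding call (enter for default 4): ")
fnd_srvc = int(raw_srvc) if raw_srvc.strip() else 4

raw_pages = input('how many pages? (76 max, enter for 2): ')
pg_num = min(int(raw_pages), 76) if raw_pages.strip() else 2

finding = FindingApi(fnd_srvc, pg_num)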
@@ -229,7 +246,12 @@ class CurateData:
         '''
         to_json = json.dumps(raw_data)
         raw_df = pd.read_json(to_json)
-        return raw_df # TODO save csv here?
+        raw_df.to_csv('raw_df.csv', mode='a')
+        raw_df = pd.read_csv('raw_df.csv', index_col=0)
+        raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # drops dupes after appending new data. (ShoppingApi call might include dupes)
+        raw_df.to_csv('raw_df.csv', mode='a') # TODO this might still only save the unmodified/undropped original. check to make sure
+
+        return raw_df

     def to_training(self, raw_data): # NOTE need to create copies not views
         '''
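On the TODO in this hunk: drop_duplicates returns a new DataFrame rather than modifying raw_df in place, so the deduped result is discarded, and the second to_csv with mode='a' appends the still-duplicated frame onto the file again. A sketch of the append / reload / dedupe / overwrite sequence that appears to be intended (assumes `import os` at module level):

# Sketch only; the os import is an assumption, 'ItemID' comes from the diff.
raw_df.to_csv('raw_df.csv', mode='a', header=not os.path.isfile('raw_df.csv'))
raw_df = pd.read_csv('raw_df.csv', index_col=0)
raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)  # assign the result back
raw_df.to_csv('raw_df.csv', mode='w')  # overwrite; appending again would re-add the dupes
return raw_df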
@@ -346,9 +368,18 @@ class CurateData:
         with open('temp_pics_source_list.txt', 'w') as f:
             json.dump(temp_pics_source_list, f)

-        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8', header=False)
-        # TODO open csv here, drop duplicates and save again unless there's a better way
-        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8', header=False)
+        # Append to master training dataframes, drop potential dupes and save
+
+        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')
+        expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
+        expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
+        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
+
+        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
+        expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
+        expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
+        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
+
         return expanded_class, expanded_dropd

     def dl_pictures(self, *args):
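The assign-back and overwrite caveat from the raw_df hunk applies to both blocks here as well, and the pattern is repeated verbatim for expanded_class and expanded_dropd. A hypothetical helper could keep the two paths consistent; the function name is illustrative, not from this repo:

# Hypothetical helper, not part of this commit; assumes os is imported.
def append_and_dedupe(df, path, subset='PictureURL'):
    df.to_csv(path, mode='a', encoding='utf-8', header=not os.path.isfile(path))
    combined = pd.read_csv(path, index_col=0)
    combined = combined.drop_duplicates(subset=[subset]).reset_index(drop=True)
    combined.to_csv(path, mode='w', encoding='utf-8')  # overwrite rather than re-append
    return combined

expanded_class = append_and_dedupe(expanded_class, 'expanded_class.csv')
expanded_dropd = append_and_dedupe(expanded_dropd, 'expanded_dropd.csv')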
@@ -454,3 +485,10 @@ def main():
 if __name__ == "__main__":
     main()

+'''
+Based on your sample set of 10 images, if you have an average of 5 images per
+listing and you download a hundred listings, you will have about 102 Gb of
+image data. That's just for one day. If you have more than a million listings
+you're looking at a little over 1Tb of image data. You don't even know if this
+is good data yet.
+'''
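The figures in the added note line up only if the images average roughly 0.2 MB each and "a hundred listings" is read as a hundred thousand. A quick back-of-envelope check, with the per-image size inferred from the 1 Tb-per-million-listings figure rather than measured:

# Back-of-envelope only; 0.2 MB/image is inferred from the note, not measured.
avg_image_mb = 0.2                      # ~1 TB / (1_000_000 listings * 5 images)
per_listing_mb = 5 * avg_image_mb       # 5 images per listing on average
print(100_000 * per_listing_mb / 1e6, 'TB for 100k listings')    # ~0.1 TB, i.e. ~100 GB
print(1_000_000 * per_listing_mb / 1e6, 'TB for 1M listings')    # ~1 TB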