duplicate id fixes. full data not getting fetched

scott 2021-11-15 21:01:02 -07:00
parent 6bdc94c8c6
commit 1faa4e86fd


@@ -20,13 +20,13 @@ class FindingApi:
Methods for accessing eBay's FindingApi services
'''
-def __init__(self, service, pageNumber):
+def __init__(self, service, target_idspc): # target ids per cat
self.service = [
'findItemsAdvanced', 'findCompletedItems',
'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
'findItemsByProduct'
][service] # Currently using only index 4, i.e., service = 4
-self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
+self.target_idspc = target_idspc
# examples of additional params you may want to add:
# 'itemFilter(0).value':'Used' consider using this with findCompletedItems call
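For context, a minimal usage sketch of the changed constructor, assuming the class is imported from this module; the numbers are illustrative, not taken from the repo:

# before this commit: the second argument was a page count
finding = FindingApi(4, 77)    # service index 4 (findItemsByCategory), pages 1..76

# after this commit: the second argument is a target number of item ids per category
finding = FindingApi(4, 100)   # collect roughly 100 ids for each category in cat_list.txt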
@@ -35,7 +35,7 @@ class FindingApi:
# 'StartTimeNewest'
# HideDuplicateItems
-def get_data(self, category_id, i):
+def get_data(self, category_id, target_idspc):
'''
Gets raw JSON data from a FindingApi service call. Currently being used to
@@ -47,6 +47,8 @@
Also consider using the exclude duplicates param and possibly others.
Research the eBay API docs to find candidates.
'''
+i = 1
+ids = []
params = {
"OPERATION-NAME":self.service,
"SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
@@ -63,16 +65,40 @@
"itemFilter.value":'true'
}
# TODO add try excepts here
-try:
-response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
-params=params, timeout=4)
-response.raise_for_status()
+while len(ids) < target_idspc:
+try:
+response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
+params=params, timeout=7)
+response.raise_for_status()
-except requests.exceptions.RequestException:
-print('connection error') #TODO DECIDE HOW TO HANDLE EXCEPTION
-data = response.json()
-return data
+except requests.exceptions.RequestException: # appears this works; need to be able to continue where you left off, or use a better timeout?
+print('connection error') #TODO DECIDE HOW TO HANDLE EXCEPTION
+return ids
+data = response.json()
+for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
+if item not in ids:
+ids.append(item['itemId'][0])
+ids = list(set(ids))
+i += 1
+params = {
+"OPERATION-NAME":self.service,
+"SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
+"SERVICE-VERSION":"1.13.0",
+"RESPONSE-DATA-FORMAT":"JSON",
+"categoryId":category_id,
+"paginationInput.entriesPerPage":"100",
+"paginationInput.PageNumber":i,
+"itemFilter(0).name":"Condition",
+"itemFilter(0).value":"Used", # recommended is conditionId instead but for some reason that doesn't work either
+# but may not be necessary anyway if you can eliminate dupes. TODO Still need to fix to work. Results are likely better than new items w/ shitty brands and pics
+"sortOrder":"StartTimeNewest",
+"itemFilter.name":"HideDuplicateItems", # this isn't working or is only working per page
+"itemFilter.value":'true'
+}
+return ids
# TODO add some other options to finding call api such as for possibly filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
# try and come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say for example, if you took a
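For reference, a standalone sketch of the paging pattern the new loop is aiming at, kept outside the diff: only the page number needs to change between calls, and a set handles the de-duplication. The function name fetch_ids, the early exit on an empty page, and returning partial results on a connection error are illustrative assumptions, not code from this commit.

import requests

def fetch_ids(params, target, url="https://svcs.ebay.com/services/search/FindingService/v1"):
    '''Page through FindingService results until `target` unique itemIds are collected.'''
    ids = set()
    page = 1
    while len(ids) < target:
        params["paginationInput.PageNumber"] = str(page)  # only the page number changes per call
        try:
            response = requests.get(url, params=params, timeout=7)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            break  # keep whatever was collected instead of losing the whole category
        items = (response.json()
                 .get("findItemsByCategoryResponse", [{}])[0]
                 .get("searchResult", [{}])[0]
                 .get("item", []))
        if not items:
            break  # ran out of results before reaching the target
        ids.update(item["itemId"][0] for item in items)
        page += 1
    return list(ids)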
@@ -87,27 +113,22 @@ class FindingApi:
Creates a 20-itemId list to use for the ShoppingApi
call
'''
-pages = self.pageNumber
+target_idspc = self.target_idspc
itemid_results_list = []
with open('cat_list.txt') as jf:
cat_list = json.load(jf)
args = []
-for category_id in cat_list:
-bargs = [(category_id, i) for i in pages]
-args.extend(bargs)
+args = [(cat, target_idspc) for cat in cat_list]
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(lambda p: self.get_data(*p), args):
-data = future
+itemid_results_list.extend(future)
-for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
-if item not in itemid_results_list:
-itemid_results_list.append(item['itemId'][0])
with open('raw_ids.txt', 'w') as f:
json.dump(itemid_results_list, f)
item_id_results = list(set(itemid_results_list))
item_id_results = [','.join(itemid_results_list[n:n+20]) for n in list(range(0,
len(itemid_results_list), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints
return item_id_results
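Worth noting: the unchanged tail of this hunk still builds the 20-id strings from itemid_results_list rather than from the de-duplicated item_id_results, so the list(set(...)) result is overwritten. A tiny sketch of the batching step itself, with made-up ids and the de-dupe applied first:

raw_ids = ['110101', '110102', '110101', '110103']   # hypothetical itemIds, one duplicate
unique_ids = list(set(raw_ids))                       # de-dupe before batching
batches = [','.join(unique_ids[n:n+20]) for n in range(0, len(unique_ids), 20)]
# each string carries up to 20 comma-separated ids for one ShoppingApi call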
@@ -204,7 +225,7 @@ class ShoppingApi:
service_dict
fnd_srvc = input(str(service_dict) + "choose Finding call: (press 'enter' for default(4))")
-pg_num = int(input('how many ids per cat? (7692 max)'))
+target_idspc = int(input('how many ids per cat? (7692 max)'))
optional_params = {
"itemFilter(0).name":"Condition",
@@ -213,10 +234,10 @@
if fnd_srvc != '':
fnd_srvc = int(fnd_srvc)
-finding = FindingApi(fnd_srvc, pg_num)
+finding = FindingApi(fnd_srvc, target_idspc)
else:
fnd_srvc = 4
-finding = FindingApi(fnd_srvc, pg_num)
+finding = FindingApi(fnd_srvc, target_idspc)
item_id_results = finding.get_ids_from_cats()
with concurrent.futures.ThreadPoolExecutor() as executor:
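A condensed sketch of the call-site flow this hunk produces, assuming service_dict and FindingApi from this module are in scope; the clamp on the user input and the empty-input defaults are hypothetical guards, not something the diff adds:

raw = input('how many ids per cat? (7692 max)')
target_idspc = min(int(raw), 7692) if raw else 7692   # hypothetical clamp; the diff uses the input as-is

fnd_srvc = input(str(service_dict) + "choose Finding call: (press 'enter' for default(4))")
fnd_srvc = int(fnd_srvc) if fnd_srvc else 4           # 4 = findItemsByCategory

finding = FindingApi(fnd_srvc, target_idspc)
item_id_results = finding.get_ids_from_cats()         # comma-joined 20-id strings for the Shopping calls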