saves list of 20_itemids instead of list of ids

scott 2021-11-30 12:43:37 -07:00
parent 61e6770812
commit 97ab3c7dab
2 changed files with 7 additions and 145 deletions

View File

@@ -23,147 +23,6 @@ from ebaysdk.finding import Connection as Finding
from ebaysdk.shopping import Connection as Shopping
class FindingApi:
    '''
    Methods for accessing eBay's FindingApi services
    '''
    def __init__(self, service, idspc):
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
        ][service]  # Currently using only index 4 (service = 4), i.e. 'findItemsByCategory'
        self.idspc = idspc  # target number of itemIds to collect per category
        # Examples of additional params you may want to add:
        # 'itemFilter(0).value':'Used' -- consider using this with the findCompletedItems call
        # 'itemFilter(1).name':'ListingType'
        # 'itemFilter(1).value':'AuctionWithBIN'
        # 'StartTimeNewest'
        # HideDuplicateItems
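        # A minimal sketch (not part of this commit) of how the indexed itemFilter
        # params mentioned above could be wired into the request dict, assuming the
        # next free filter index is 1:
        #
        #   params["itemFilter(1).name"] = "ListingType"
        #   params["itemFilter(1).value"] = "AuctionWithBIN"
        #
        # name/value pairs that share an index belong to the same filter.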
    def get_data(self, category_id, idspc):
        '''
        Gets raw JSON data from a FindingApi service call. Currently being used to
        get itemIDs from categories.
        '''
        # startTime = dateutil.parser.isoparse( startTime )
        # now = datetime.datetime.now(tz=pytz.UTC)
        # days_on_site = (now - startTime).days  # as int
        ids = []
        modTimeFrom = datetime.now() - timedelta(seconds=5)  # initialize modTimeFrom value
        i = 1
        params = {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "20",
            "paginationInput.PageNumber": i,
            "itemFilter(0).name": "Condition",
            "itemFilter(0).value": "Used",
            "itemFilter.name": "HideDuplicateItems",
            "itemFilter.value": "true",
            "sortOrder": "StartTimeNewest",
        }
        # "itemFilter.name(2)":"modTimeFrom",
        # "itemFilter.value(2)":modTimeFrom,
        while len(ids) < idspc:  # page through results, 20 per call, until the target id count is reached
            try:
                response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                        params=params, timeout=24)
                response.raise_for_status()
            except requests.exceptions.RequestException:  # appears to work; TODO: continue where you left off, or use a better timeout?
                print('connection error')
                return ids, None, modTimeFrom, None  # keep the return shape consistent for callers
            try:
                data = response.json()
                itemSearchURL = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
                modTimeFrom = data['findItemsByCategoryResponse'][0]['searchResult'][0]['item'][-1]['listingInfo'][0]['startTime'][0]
                modTimeFrom = dateutil.parser.isoparse(modTimeFrom)
                modTimeFrom = modTimeFrom - timedelta(seconds=5)  # TODO: convert back to GMT string format before reuse
                for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                    # if item not in ids:
                    ids.append(item['itemId'][0])
                    # ids = list(set(ids))
            except (AttributeError, KeyError):
                print('AttributeError or KeyError. Exiting')
                print(response.json())
                return ids, None, modTimeFrom, None  # keep the return shape consistent for callers
            input('press enter to continue')  # manual checkpoint while debugging
            i += 1
            params["paginationInput.PageNumber"] = i  # only the page number changes between requests
        return ids, data, modTimeFrom, itemSearchURL
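        # A minimal sketch (assumption, not part of this commit) for the GMT TODO above:
        # the parsed modTimeFrom would need to be re-serialized to the ISO-8601/GMT string
        # form the API returns before being reused as a filter value, e.g.:
        #
        #   modTimeFrom_str = modTimeFrom.strftime("%Y-%m-%dT%H:%M:%S.000Z")
        #
        # and then passed through the commented-out modTimeFrom itemFilter lines above.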
    # TODO: add some other options to the finding api call, such as filtering for used items only. This might give you a
    # better dataset for training, or maybe a mixture of new and used. Maybe try to come up with a way to mathematically
    # determine your odds of maximizing the number of useful pictures in your training set while reducing the number of
    # useless images. For example, if you took a random set of 3 of 8 pictures total from each listing, you might have a
    # better chance of getting 3 good pictures in addition to increasing your training set; or maybe you would have
    # better luck limiting it to the first 5 pictures instead of a random sample.
    # You may even have more consistency with used shoes since they are "one-off" items without confusing multiple
    # variations and colors. You could also run small training sets on both new and used to see which one is more
    # accurate, or if a combo of both is more accurate.
    def get_ids_from_cats(self):  # TODO: need to resolve duplicates here to maximize unique ids/data per ShoppingApi call
        '''
        Creates a list of 20-itemId strings to use for the ShoppingApi
        call
        '''
        # target_idspc = self.target_idspc
        idspc = self.idspc
        itemid_results_list = []
        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)
        args = [(cat, idspc) for cat in cat_list]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(lambda p: self.get_data(*p), args):
                itemid_results_list.extend(future[0])  # get_data returns (ids, data, modTimeFrom, itemSearchURL)
        print(len(itemid_results_list))
        a = list(set(itemid_results_list))
        print(len(a))  # compare raw vs. unique id counts
        input('press enter to continue')  # manual checkpoint while debugging
        with open('raw_ids.txt', 'w') as f:
            json.dump(itemid_results_list, f)
        item_id_results = [','.join(itemid_results_list[n:n+20]) for n in
                           range(0, len(itemid_results_list), 20)]  # 20-ItemID list created to maximize dataset/decrease calls given call constraints
        return item_id_results, itemid_results_list
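        # Quick illustration of the chunking above (assuming the downstream Shopping
        # call accepts at most 20 comma-separated ItemIDs per request): 50 raw ids
        # become three strings holding 20, 20 and 10 ids respectively, e.g.
        #
        #   ids = [str(n) for n in range(50)]
        #   chunks = [','.join(ids[n:n+20]) for n in range(0, len(ids), 20)]
        #   assert len(chunks) == 3 and chunks[-1].count(',') == 9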
    # TODO: during your try/except conditionals just check the csv files. At the end you can create sets. You can create
    # another condition that says if the final set is smaller than 100k, then call the finding service on more pages
    # (but only pages you haven't tried) and repeat the search process.
    # TODO: instead of running through multiple try/except loops, try to implement set methods for efficiency and ease.
    # Remember symmetric_difference, difference, intersection, set().

# for category_id in cat_list:
class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with

View File

@@ -65,12 +65,15 @@ def threaded_urls():
    return urls
def get_ids(url):
    '''
    Scrapes listing links in url for item IDs
    '''
    html = requests.get(url).text
    soup = b(html, "html.parser")
    ids = list(soup.find_all(href=re.compile(r"[\d]+(?=\?hash)")))
    ids = [id['href'] for id in ids]
    ids = [re.findall(r"[\d]+(?=\?)", id)[0] for id in ids]
    ids = list(set(ids))  # necessary; two links are returned per listing by the pattern match
    return ids
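# Hypothetical usage sketch (the URL below is illustrative, not from this commit):
#
#   ids = get_ids("https://www.ebay.com/sch/i.html?_nkw=running+shoes")
#   # returns the de-duplicated numeric item ids found before "?hash" in listing hrefs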
@@ -81,10 +84,10 @@ def threaded_get_ids(urls):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for future in executor.map(get_ids, urls):
            ids.extend(future)
    item_id_results = [','.join(ids[n:n+20]) for n in
                       range(0, len(ids), 20)]  # 20-ItemID list created to maximize dataset/decrease calls given call constraints
    with open('ids.txt', 'w') as f:
        json.dump(ids, f)
    return item_id_results
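# Sketch of how these helpers appear intended to chain together (assumption):
#
#   urls = threaded_urls()
#   item_id_results = threaded_get_ids(urls)  # also writes the scraped ids to ids.txt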
def main():