unique ids fix

scott 2021-12-01 13:43:14 -07:00
parent 93297f952e
commit 29800af1d4


@@ -36,15 +36,15 @@ def get_isurl(category_id): # "get itemSearchURL"
return url
try:
data = response.json()
print(data)
# NOTE approx 220 pages of listings per cat @ 35 items per page
item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1" # preowned
item_cond_new = '&LH_ItemCondition=3'
urls = []
url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
url = url+item_cond
j = list(range(1,221))
for i in j:
pg = "&_pgn={}".format(str(i))
url = url.replace('&_pgn=1', pg)
base_url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
for pg in list(range(1,34)): # No results after around page 32
url = base_url+"&_pgn="+str(pg)+item_cond
print(url)
urls.append(url)
except (AttributeError, KeyError):
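The rewritten loop is what makes the IDs unique: the old version overwrote url in place with url.replace('&_pgn=1', pg), and once the page number had been rewritten the '&_pgn=1' substring no longer existed, so after the second iteration every appended URL pointed at the same page, which is presumably the duplicate-ID problem this commit fixes. The new version leaves base_url untouched and appends the page number and the condition filter on each pass. A minimal sketch of that construction, assuming the same item_cond filter string and the page-32 cutoff noted in the comment (the helper name build_page_urls is hypothetical):

def build_page_urls(base_url, item_cond="&rt=nc&LH_ItemCondition=3000&mag=1", last_page=33):
    # Build one search URL per page from an unmodified base itemSearchURL,
    # mirroring the new for-loop above; results reportedly stop around page 32.
    return [base_url + "&_pgn=" + str(pg) + item_cond for pg in range(1, last_page + 1)]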
@@ -70,17 +70,22 @@ def get_ids(url):
'''
html = requests.get(url).text
soup = b(html, "html.parser")
print(soup)
ids = list(soup.find_all(href=re.compile(r"[\d]+(?=\?hash)")))
ids = [id['href'] for id in ids]
ids = [re.findall(r"[\d]+(?=\?)", id)[0] for id in ids]
ids = list(set(ids)) # necessary; two links are returned with pattern match
print(ids)
return ids
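get_ids() pulls the numeric item IDs out of listing anchors whose href carries the ID followed by '?hash='. A minimal restatement of that extraction, assuming the same requests / BeautifulSoup imports as the file above; the function name get_ids_sketch and the timeout argument are additions for illustration:

import re
import requests
from bs4 import BeautifulSoup as b

def get_ids_sketch(url):
    # Fetch one search-results page and collect the numeric item IDs from
    # links shaped like .../itm/<slug>/123456789012?hash=....
    html = requests.get(url, timeout=10).text
    soup = b(html, "html.parser")
    ids = set()  # a set absorbs the duplicate links the comment above mentions
    for tag in soup.find_all(href=re.compile(r"\d+(?=\?hash)")):
        match = re.search(r"(\d+)\?", tag["href"])
        if match:
            ids.add(match.group(1))
    return list(ids)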
def threaded_get_ids(urls):
'''
Runs get_ids() within a ThreadPoolExecutor() for multithreaded requests.
Constructs and saves the unique IDs and the 20-ItemID batches for use with
the ebay_api methods.
'''
try:
with open('item_id_results.txt') as f:
with open('ids.txt') as f:
ids = json.load(f)
except FileNotFoundError:
ids = []
@@ -89,14 +94,32 @@ def threaded_get_ids(urls):
for future in executor.map(get_ids, urls):
ids.extend(future)
ids = list(set(ids)) # necessary; two links are returned with pattern match
item_id_results = [','.join(ids[n:n+20]) for n in list(range(0,
len(ids), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints
with open('ids.txt', 'w') as f:
json.dump(ids,f)
with open('item_id_results.txt', 'w') as f:
json.dump(item_id_results, f)
return item_id_results
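threaded_get_ids() fans the page URLs out over a thread pool, merges whatever was already saved in ids.txt, de-duplicates, and then joins the IDs into comma-separated batches of 20 so each later API call carries as many ItemIDs as the call constraints allow (per the inline comment). A condensed sketch of that flow, assuming a get_ids callable like the one above is passed in; the name gather_ids and the max_workers parameter are illustrative:

import json
from concurrent.futures import ThreadPoolExecutor

def gather_ids(urls, get_ids, max_workers=None, path="ids.txt"):
    # Reuse any IDs saved by a previous run, then fetch every page concurrently.
    try:
        with open(path) as f:
            ids = json.load(f)
    except FileNotFoundError:
        ids = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for page_ids in executor.map(get_ids, urls):
            ids.extend(page_ids)
    ids = list(set(ids))  # drop duplicates across pages and across runs
    # Comma-join into groups of 20 IDs so each downstream call is as full as possible.
    batches = [",".join(ids[n:n + 20]) for n in range(0, len(ids), 20)]
    return ids, batches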
def id_count():
'''
Counts Unique IDs of item_id_results for testing
'''
with open('item_id_results.txt') as f:
item_id_results = json.load(f)
ids = ','.join(item_id_results)
ids = ids.split(',')
uniq = len(list(set(ids)))
print('{} Unique IDs'.format(uniq))
return ids
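id_count() is the sanity check for the batching: it flattens the saved 20-ID strings back into individual IDs and counts the distinct ones, which should match the number of IDs in ids.txt if the de-duplication held. A small variant of the same check; the guard against empty strings and the name count_unique are not in the original:

import json

def count_unique(path="item_id_results.txt"):
    # Flatten the comma-joined batches and count distinct item IDs.
    with open(path) as f:
        batches = json.load(f)
    ids = [i for i in ",".join(batches).split(",") if i]  # skip empty pieces
    uniq = len(set(ids))
    print("{} Unique IDs".format(uniq))
    return uniq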
def main():
urls = threaded_urls()
item_id_results = threaded_get_ids(urls)