From 29800af1d4bc112e83b3b7e974e00dd6cdb7c461 Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 1 Dec 2021 13:43:14 -0700 Subject: [PATCH] unique ids fix --- scrape_ids.py | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/scrape_ids.py b/scrape_ids.py index 8f0f931..fdfa160 100644 --- a/scrape_ids.py +++ b/scrape_ids.py @@ -36,15 +36,15 @@ def get_isurl(category_id): # "get itemSearchURL" return url try: data = response.json() + print(data) # NOTE approx 220 pages of listings per cat @ 35 items per page item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1" # preowned + item_cond_new = '&LH_ItemCondition=3' urls = [] - url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0] - url = url+item_cond - j = list(range(1,221)) - for i in j: - pg = "&_pgn={}".format(str(i)) - url = url.replace('&_pgn=1', pg) + base_url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0] + for pg in list(range(1,34)): # No results after around page 32 + url = base_url+"&_pgn="+str(pg)+item_cond + print(url) urls.append(url) except (AttributeError, KeyError): @@ -70,17 +70,22 @@ def get_ids(url): ''' html = requests.get(url).text soup = b(html, "html.parser") + print(soup) ids = list(soup.find_all(href=re.compile(r"[\d]+(?=\?hash)"))) ids = [id['href'] for id in ids] ids = [re.findall(r"[\d]+(?=\?)", id)[0] for id in ids] - ids = list(set(ids)) # necessary; two links are returned with pattern match + print(ids) return ids def threaded_get_ids(urls): - + ''' + Runs get_ids() w/in ThreadPoolExecutor() for multi threaded requests. + Constructs and saves unique ids and 20_itemIDs for use with ebay_api + methods + ''' try: - with open('item_id_results.txt') as f: + with open('ids.txt') as f: ids = json.load(f) except FileNotFoundError: ids = [] @@ -89,14 +94,32 @@ def threaded_get_ids(urls): for future in executor.map(get_ids, urls): ids.extend(future) + ids = list(set(ids)) # necessary; two links are returned with pattern match item_id_results = [','.join(ids[n:n+20]) for n in list(range(0, len(ids), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints + with open('ids.txt', 'w') as f: + json.dump(ids,f) + with open('item_id_results.txt', 'w') as f: json.dump(item_id_results, f) return item_id_results +def id_count(): + ''' + Counts Unique IDs of item_id_results for testing + ''' + with open('item_id_results.txt') as f: + item_id_results = json.load(f) + + ids = ','.join(item_id_results) + ids = ids.split(',') + uniq = len(list(set(ids))) + print('{} Unique IDs'.format(uniq)) + + return ids + def main(): urls = threaded_urls() item_id_results = threaded_get_ids(urls)