unique ids fix

scott 2021-12-01 13:43:14 -07:00
parent 93297f952e
commit 29800af1d4


@@ -36,15 +36,15 @@ def get_isurl(category_id): # "get itemSearchURL"
return url
try:
data = response.json()
print(data)
# NOTE approx 220 pages of listings per cat @ 35 items per page
item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1" # preowned
item_cond_new = '&LH_ItemCondition=3'
urls = []
url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
url = url+item_cond
j = list(range(1,221))
for i in j:
pg = "&_pgn={}".format(str(i))
url = url.replace('&_pgn=1', pg)
base_url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
for pg in list(range(1,34)): # No results after around page 32
url = base_url+"&_pgn="+str(pg)+item_cond
print(url)
urls.append(url)
except (AttributeError, KeyError):
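The rewritten loop is what makes the IDs unique: the old version overwrote url in place with url.replace('&_pgn=1', pg), and once the page number had been rewritten the '&_pgn=1' substring no longer existed, so after the second iteration every appended URL pointed at the same page, which is presumably the duplicate-ID problem this commit fixes. The new version leaves base_url untouched and appends the page number and the condition filter on each pass. A minimal sketch of that construction, assuming the same item_cond filter string and the page-32 cutoff noted in the comment (the helper name build_page_urls is hypothetical):

def build_page_urls(base_url, item_cond="&rt=nc&LH_ItemCondition=3000&mag=1", last_page=33):
    # Build one search URL per page from an unmodified base itemSearchURL,
    # mirroring the new for-loop above; results reportedly stop around page 32.
    return [base_url + "&_pgn=" + str(pg) + item_cond for pg in range(1, last_page + 1)]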
@@ -70,17 +70,22 @@ def get_ids(url):
'''
html = requests.get(url).text
soup = b(html, "html.parser")
print(soup)
ids = list(soup.find_all(href=re.compile(r"[\d]+(?=\?hash)")))
ids = [id['href'] for id in ids]
ids = [re.findall(r"[\d]+(?=\?)", id)[0] for id in ids]
ids = list(set(ids)) # necessary; two links are returned with pattern match
print(ids)
return ids
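get_ids() pulls the numeric item IDs out of listing anchors whose href carries the ID followed by '?hash='. A minimal restatement of that extraction, assuming the same requests / BeautifulSoup imports as the file above; the function name get_ids_sketch and the timeout argument are additions for illustration:

import re
import requests
from bs4 import BeautifulSoup as b

def get_ids_sketch(url):
    # Fetch one search-results page and collect the numeric item IDs from
    # links shaped like .../itm/<slug>/123456789012?hash=....
    html = requests.get(url, timeout=10).text
    soup = b(html, "html.parser")
    ids = set()  # a set absorbs the duplicate links the comment above mentions
    for tag in soup.find_all(href=re.compile(r"\d+(?=\?hash)")):
        match = re.search(r"(\d+)\?", tag["href"])
        if match:
            ids.add(match.group(1))
    return list(ids)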
def threaded_get_ids(urls):
'''
Runs get_ids() within a ThreadPoolExecutor() for multithreaded requests.
Constructs and saves the unique IDs and the 20-ItemID batches for use with
the ebay_api methods.
'''
try:
with open('item_id_results.txt') as f:
with open('ids.txt') as f:
ids = json.load(f)
except FileNotFoundError:
ids = []
@@ -89,14 +94,32 @@ def threaded_get_ids(urls):
for future in executor.map(get_ids, urls):
ids.extend(future)
ids = list(set(ids)) # necessary; two links are returned with pattern match
item_id_results = [','.join(ids[n:n+20]) for n in list(range(0,
len(ids), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints
with open('ids.txt', 'w') as f:
json.dump(ids,f)
with open('item_id_results.txt', 'w') as f:
json.dump(item_id_results, f)
return item_id_results
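threaded_get_ids() fans the page URLs out over a thread pool, merges whatever was already saved in ids.txt, de-duplicates, and then joins the IDs into comma-separated batches of 20 so each later API call carries as many ItemIDs as the call constraints allow (per the inline comment). A condensed sketch of that flow, assuming a get_ids callable like the one above is passed in; the name gather_ids and the max_workers parameter are illustrative:

import json
from concurrent.futures import ThreadPoolExecutor

def gather_ids(urls, get_ids, max_workers=None, path="ids.txt"):
    # Reuse any IDs saved by a previous run, then fetch every page concurrently.
    try:
        with open(path) as f:
            ids = json.load(f)
    except FileNotFoundError:
        ids = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for page_ids in executor.map(get_ids, urls):
            ids.extend(page_ids)
    ids = list(set(ids))  # drop duplicates across pages and across runs
    # Comma-join into groups of 20 IDs so each downstream call is as full as possible.
    batches = [",".join(ids[n:n + 20]) for n in range(0, len(ids), 20)]
    return ids, batches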
def id_count():
'''
Counts Unique IDs of item_id_results for testing
'''
with open('item_id_results.txt') as f:
item_id_results = json.load(f)
ids = ','.join(item_id_results)
ids = ids.split(',')
uniq = len(list(set(ids)))
print('{} Unique IDs'.format(uniq))
return ids
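id_count() is the sanity check for the batching: it flattens the saved 20-ID strings back into individual IDs and counts the distinct ones, which should match the number of IDs in ids.txt if the de-duplication held. A small variant of the same check; the guard against empty strings and the name count_unique are not in the original:

import json

def count_unique(path="item_id_results.txt"):
    # Flatten the comma-joined batches and count distinct item IDs.
    with open(path) as f:
        batches = json.load(f)
    ids = [i for i in ",".join(batches).split(",") if i]  # skip empty pieces
    uniq = len(set(ids))
    print("{} Unique IDs".format(uniq))
    return uniq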
def main():
urls = threaded_urls()
item_id_results = threaded_get_ids(urls)