scrape_ids working. Added function to conky in ebay_api

scott 2021-11-26 18:46:51 -07:00
parent ab064ce5cf
commit 0f65ab0bc2
2 changed files with 41 additions and 12 deletions

View File

@@ -1,4 +1,5 @@
import os
import scrape_ids
from datetime import datetime, timedelta
import dateutil
from dateutil import parser
@@ -261,7 +262,7 @@ class ShoppingApi:
fnd_srvc = 4
finding = FindingApi(fnd_srvc, target_idspc)
item_id_results = finding.get_ids_from_cats()
item_id_results = scrape_ids.main()
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
for item in future:

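The ebay_api change above replaces the FindingApi category walk with scrape_ids.main() as the ItemID source and keeps the threaded fetch loop. A minimal sketch of that pattern, assuming get_item_from_findItemsByCategory takes one comma-joined ItemID string and returns an iterable of parsed items; fetch_batch below is a hypothetical stand-in for it, since its body is not part of this diff:

```python
# Sketch only: fetch_batch stands in for ShoppingApi.get_item_from_findItemsByCategory,
# whose real body is not shown in this diff.
import concurrent.futures
import scrape_ids

def fetch_batch(id_batch):
    """Hypothetical stand-in: call the Shopping API with up to 20
    comma-joined ItemIDs and return the parsed item records."""
    return []

def collect_items():
    data = []
    item_id_results = scrape_ids.main()  # list of '<id>,<id>,...' strings
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields each call's return value in order;
        # every ID batch is fetched on its own worker thread
        for batch_result in executor.map(fetch_batch, item_id_results):
            for item in batch_result:
                data.append(item)
    return data
```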
View File

@@ -11,6 +11,7 @@ def get_isurl(category_id): # "get itemSearchURL"
Gets raw JSON data from FindingApi service call. Currently being used to
get itemIDs from categories;
'''
params = {
"OPERATION-NAME":'findItemsByCategory',
"SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
@@ -35,20 +36,23 @@ def get_isurl(category_id): # "get itemSearchURL"
return url
try:
data = response.json()
i = 1 # NOTE approx 220 pages of listings per cat @ 35 items per page
pg = "&_pgn={}".format(str(i))
# NOTE approx 220 pages of listings per cat @ 35 items per page
item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1" # preowned
url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0].replace('&_pgn=1', pg)
urls = []
url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
url = url+item_cond
j = list(range(1,221))
for i in j:
pg = "&_pgn={}".format(str(i))
url = url.replace('&_pgn=1', pg)
urls.append(url)
except (AttributeError, KeyError):
print('AttributeError or KeyError. Exiting')
print(response.json())
return data
return url
return urls
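The new get_isurl body appends the preowned condition filter and then expands the single itemSearchURL into roughly 220 page URLs. A hedged sketch of that pagination step, assuming the URL returned by findItemsByCategory contains the literal '&_pgn=1'; substituting into the unmodified base on every pass keeps each page number distinct (writing the result back into url, as in the loop above, leaves no '&_pgn=1' to replace after the second iteration):

```python
ITEM_COND = "&rt=nc&LH_ItemCondition=3000"  # preowned filter, as in the diff

def paginate(base_url, pages=220):
    """Build one URL per results page from a single itemSearchURL.

    Assumes base_url contains the '&_pgn=1' query parameter and that a
    category runs to roughly 220 pages at ~35 listings per page.
    """
    base_url = base_url + ITEM_COND
    # substitute into the original base every time so '&_pgn=1' is still
    # present for each page number
    return [base_url.replace('&_pgn=1', '&_pgn={}'.format(i))
            for i in range(1, pages + 1)]
```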
def threaded_urls(url):
def threaded_urls():
urls = []
with open('cat_list.txt') as jf:
@@ -56,13 +60,37 @@ def threaded_urls(url):
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(get_isurl, cat_list):
urls.append(future)
urls.extend(future)
return urls
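threaded_urls fans get_isurl out across every category in cat_list.txt and flattens the per-category URL lists with extend. The file read itself falls outside the hunk, so the sketch below fills it in under assumptions: cat_list.txt is taken to hold a JSON array of category IDs, and page_urls_for is a hypothetical stand-in for get_isurl.

```python
import concurrent.futures
import json

def page_urls_for(category_id):
    """Hypothetical stand-in for get_isurl: returns the ~220 page URLs
    for one category."""
    return []

def threaded_urls():
    urls = []
    # assumption: cat_list.txt holds a JSON array of eBay category IDs
    with open('cat_list.txt') as jf:
        cat_list = json.load(jf)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # each map result is a list of page URLs, so extend() flattens
        # them into one combined list
        for result in executor.map(page_urls_for, cat_list):
            urls.extend(result)
    return urls
```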
def get_ids(url):
html = requests.get(url).text
soup = b(html, "html.parser")
ids = list(ids = soup.find_all(href=re.compile(r"[\d]+(?=\?hash)")))
ids = list(soup.find_all(href=re.compile(r"[\d]+(?=\?hash)")))
ids = [id['href'] for id in ids]
ids = [re.findall(r"[\d]+(?=\?)", id) for id in ids]
ids = [re.findall(r"[\d]+(?=\?)", id)[0] for id in ids]
ids = list(set(ids))
return ids
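get_ids keeps only anchors whose href has digits immediately before '?hash' and then pulls those digits out as the ItemID. A small illustration of the two regex passes on a made-up href (the URL below is illustrative only, not taken from a real response):

```python
import re

# illustrative href of the shape the soup.find_all(href=...) filter matches
href = "https://www.ebay.com/itm/Vintage-Denim-Jacket/254947800123?hash=item3b5c0e2abc"

# pass 1: the find_all filter keeps anchors whose href has digits
# immediately followed by '?hash'
assert re.search(r"[\d]+(?=\?hash)", href)

# pass 2: extract the ItemID itself (the digit run just before the '?')
item_id = re.findall(r"[\d]+(?=\?)", href)[0]
print(item_id)  # -> 254947800123
```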
def threaded_get_ids(urls):
ids = []
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(get_ids, urls):
ids.extend(future)
with open('ids.txt', 'w') as f:
json.dump(ids, f)
item_id_results = [','.join(ids[n:n+20]) for n in list(range(0,
len(ids), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints
return item_id_results
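threaded_get_ids then joins the deduplicated IDs into comma-separated batches of 20, matching the per-call ItemID limit noted in the comment above. A tiny worked example of that chunking with made-up IDs:

```python
ids = [str(100000 + n) for n in range(45)]  # 45 fake ItemIDs

# same chunking as in the diff: comma-join groups of 20 so one API call
# covers as many items as the ItemID limit allows
item_id_results = [','.join(ids[n:n + 20]) for n in range(0, len(ids), 20)]

print(len(item_id_results))  # -> 3 batches (20 + 20 + 5)
print(item_id_results[-1])   # -> '100040,100041,100042,100043,100044'
```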
def main():
urls = threaded_urls()
item_id_results = threaded_get_ids(urls)
return item_id_results
if __name__=="__main__":
main()