traditional id scraper with BeautifulSoup
parent eb780faf40
commit ab064ce5cf

scrape_ids.py | 68 lines (new file)
@@ -0,0 +1,68 @@
from bs4 import BeautifulSoup as b
import re
import json
import requests
import concurrent.futures
import config as cfg
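
# NOTE: `config` is a local module that is not part of this commit. Based on
# the lookup below, it is assumed to expose a dict named `sec` holding the
# eBay Finding API credential; a minimal hypothetical config.py would be:
#
#     sec = {'SECURITY-APPNAME': 'YOUR-EBAY-APP-ID'}
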

def get_isurl(category_id):  # "get itemSearchURL"
    '''
    Gets raw JSON data from the FindingApi service call. Currently being used
    to get itemIDs from categories.
    '''
    params = {
        "OPERATION-NAME": 'findItemsByCategory',
        "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
        "SERVICE-VERSION": "1.13.0",
        "RESPONSE-DATA-FORMAT": "JSON",
        "categoryId": category_id,
        "paginationInput.entriesPerPage": "1",
        "paginationInput.PageNumber": 1,
        "itemFilter(0).name": "Condition",
        "itemFilter(0).value": "Used",
        # itemFilters are indexed when more than one is supplied
        "itemFilter(1).name": "HideDuplicateItems",
        "itemFilter(1).value": "true",
    }

    try:
        response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                params=params, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('connection error')
        return None  # no url has been built yet, so there is nothing else to return
    try:
        data = response.json()
        i = 1  # NOTE approx 220 pages of listings per cat @ 35 items per page
        pg = "&_pgn={}".format(str(i))
        item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1"  # preowned
        url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0].replace('&_pgn=1', pg)
        url = url + item_cond
    except (AttributeError, KeyError):
        print('AttributeError or KeyError. Exiting')
        print(response.json())
        return data

    return url
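
# For reference, the JSON consumed above is assumed to look roughly like the
# following, trimmed to the only fields that are actually read (a sketch, not
# the full Finding API response schema):
#
#     {"findItemsByCategoryResponse": [
#         {"itemSearchURL": ["https://www.ebay.com/sch/...?...&_pgn=1"]}
#     ]}
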

def threaded_urls(url):
    '''
    Builds one itemSearchURL per category listed in cat_list.txt, running the
    lookups concurrently. The url argument is currently unused.
    '''
    urls = []
    with open('cat_list.txt') as jf:
        cat_list = json.load(jf)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for future in executor.map(get_isurl, cat_list):
            urls.append(future)

    return urls
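
# cat_list.txt is expected to hold a JSON array of eBay category ids, one per
# category to scrape; hypothetical example contents:
#
#     [11450, 15032, 58058]
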

def get_ids(url):
    '''Scrapes the numeric item ids out of one search-results page.'''
    html = requests.get(url).text
    soup = b(html, "html.parser")
    ids = soup.find_all(href=re.compile(r"[\d]+(?=\?hash)"))
    ids = [id['href'] for id in ids]
    ids = [re.findall(r"[\d]+(?=\?)", id) for id in ids]
    return ids
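
A short usage sketch tying the pieces together, assuming cat_list.txt sits next to scrape_ids.py; the None placeholder for the unused url argument and the filtering of failed lookups are illustrative only:

from scrape_ids import threaded_urls, get_ids

urls = [u for u in threaded_urls(None) if u]   # skip categories whose lookup failed
item_ids = [get_ids(u) for u in urls]          # item ids scraped from each results page
print(item_ids)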