From ab064ce5cf5892b1b9d5420d3ed9ba965079292a Mon Sep 17 00:00:00 2001
From: spbeach46
Date: Thu, 25 Nov 2021 18:31:54 -0700
Subject: [PATCH] traditional id scraper with BeautifulSoup

---
 scrape_ids.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 scrape_ids.py

diff --git a/scrape_ids.py b/scrape_ids.py
new file mode 100644
index 0000000..f9eda25
--- /dev/null
+++ b/scrape_ids.py
@@ -0,0 +1,68 @@
+from bs4 import BeautifulSoup as b
+import re
+import json
+import requests
+import concurrent.futures
+import config as cfg
+
+
+def get_isurl(category_id):  # "get itemSearchURL"
+    '''
+    Calls the eBay Finding API (findItemsByCategory) and returns the
+    itemSearchURL for the given category, restricted to used listings.
+    '''
+    params = {
+        "OPERATION-NAME": 'findItemsByCategory',
+        "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
+        "SERVICE-VERSION": "1.13.0",
+        "RESPONSE-DATA-FORMAT": "JSON",
+        "categoryId": category_id,
+        "paginationInput.entriesPerPage": "1",
+        "paginationInput.pageNumber": "1",
+        "itemFilter(0).name": "Condition",
+        "itemFilter(0).value": "Used",
+        "itemFilter(1).name": "HideDuplicateItems",
+        "itemFilter(1).value": "true",
+    }
+
+    try:
+        response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
+                                params=params, timeout=24)
+        response.raise_for_status()
+    except requests.exceptions.RequestException:
+        print('connection error')
+        return None
+
+    try:
+        data = response.json()
+        i = 1  # NOTE approx 220 pages of listings per cat @ 35 items per page
+        pg = "&_pgn={}".format(str(i))
+        item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1"  # preowned
+        url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0].replace('&_pgn=1', pg)
+        url = url + item_cond
+    except (AttributeError, KeyError):
+        print('AttributeError or KeyError. Exiting')
+        print(response.json())
+        return None
+
+    return url
+
+
+def threaded_urls():
+    '''
+    Loads category ids from cat_list.txt and fetches the search URL for
+    each category concurrently.
+    '''
+    urls = []
+    with open('cat_list.txt') as jf:
+        cat_list = json.load(jf)
+
+    # executor.map yields results in order (get_isurl returns None on failure)
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        for url in executor.map(get_isurl, cat_list):
+            if url:
+                urls.append(url)
+
+    return urls
+
+
+def get_ids(url):
+    '''
+    Scrapes a single search results page and extracts the item ids from
+    the listing hrefs.
+    '''
+    html = requests.get(url).text
+    soup = b(html, "html.parser")
+    links = soup.find_all(href=re.compile(r"[\d]+(?=\?hash)"))
+    hrefs = [link['href'] for link in links]
+    ids = [re.findall(r"[\d]+(?=\?)", href)[0] for href in hrefs]  # keep only the numeric item id
+    return ids
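+
+# Minimal usage sketch (an illustrative addition, not part of the original script):
+# assumes cat_list.txt holds a JSON list of eBay category ids and that config.py
+# defines sec['SECURITY-APPNAME'].
+if __name__ == "__main__":
+    for search_url in threaded_urls():
+        print(get_ids(search_url))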