from bs4 import BeautifulSoup as b
import re
import json
import requests
import concurrent.futures
import config as cfg


def get_isurl(category_id):  # "get itemSearchURL"
    '''
    Gets raw JSON data from a FindingApi service call. Currently being used
    to get itemIDs from categories.
    '''
    params = {
        "OPERATION-NAME": 'findItemsByCategory',
        "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
        "SERVICE-VERSION": "1.13.0",
        "RESPONSE-DATA-FORMAT": "JSON",
        "categoryId": category_id,
        "paginationInput.entriesPerPage": "1",
        "paginationInput.pageNumber": "1",  # Finding API expects lowercase "pageNumber"
        "itemFilter(0).name": "Condition",
        "itemFilter(0).value": "Used",
        # A second itemFilter must use the next index, (1)
        "itemFilter(1).name": "HideDuplicateItems",
        "itemFilter(1).value": "true",
    }
    try:
        response = requests.get(
            "https://svcs.ebay.com/services/search/FindingService/v1",
            params=params, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('connection error')
        return None  # no url exists yet at this point; signal failure instead

    try:
        data = response.json()
        i = 1  # page-number placeholder for the _pgn query parameter
        # NOTE approx 220 pages of listings per cat @ 35 items per page
        pg = "&_pgn={}".format(str(i))
        item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1"  # preowned
        url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0].replace('&_pgn=1', pg)
        url = url + item_cond
    except (AttributeError, KeyError):
        print('AttributeError or KeyError. Exiting')
        print(response.json())
        return None
    return url


def threaded_urls():
    '''Builds an itemSearchURL for every category in cat_list.txt, concurrently.'''
    urls = []
    with open('cat_list.txt') as jf:
        cat_list = json.load(jf)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields results directly (not Future objects), in order
        for result in executor.map(get_isurl, cat_list):
            urls.append(result)
    return urls


def get_ids(url):
    '''Scrapes the itemIDs out of the listing links on one search-results page.'''
    html = requests.get(url, timeout=24).text
    soup = b(html, "html.parser")
    # Listing links look like .../itm/<title>/123456789012?hash=...
    tags = soup.find_all(href=re.compile(r"[\d]+(?=\?hash)"))
    hrefs = [tag['href'] for tag in tags]
    # findall returns a list per href, so this is a list of one-element lists
    ids = [re.findall(r"[\d]+(?=\?)", href) for href in hrefs]
    return ids
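

# --- Usage sketch (an assumption, not part of the original module) ---
# A minimal driver showing how the pieces fit together: build one search
# URL per category, then scrape the itemIDs from each results page.
# Assumes cat_list.txt holds a JSON array of eBay category IDs and that
# config.py defines sec['SECURITY-APPNAME'] with a valid app ID.
if __name__ == "__main__":
    urls = threaded_urls()
    for url in urls:
        if url is None:  # get_isurl returns None on a request or parse failure
            continue
        item_ids = get_ids(url)
        print(url, len(item_ids))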