from bs4 import BeautifulSoup as b
import re
import json
import requests
import concurrent.futures

import config as cfg


def get_isurl(category_id):  # "get itemSearchURL"
    '''
    Gets raw JSON data from a Finding API service call. Currently used to
    build the paginated search URLs from which item IDs are scraped per category.
    '''
    params = {
        "OPERATION-NAME": "findItemsByCategory",
        "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
        "SERVICE-VERSION": "1.13.0",
        "RESPONSE-DATA-FORMAT": "JSON",
        "categoryId": category_id,
        "paginationInput.entriesPerPage": "1",
        "paginationInput.pageNumber": "1",
        "itemFilter(0).name": "Condition",
        "itemFilter(0).value": "Used",
        "itemFilter(1).name": "HideDuplicateItems",  # item filters are indexed when more than one is supplied
        "itemFilter(1).value": "true",
    }
    try:
        response = requests.get(
            "https://svcs.ebay.com/services/search/FindingService/v1",
            params=params, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('connection error')
        return []  # keep the return type consistent so callers can extend()

    urls = []
    try:
        data = response.json()
        # NOTE approx 220 pages of listings per category @ 35 items per page
        item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1"  # preowned
        base_url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0] + item_cond
        for page in range(1, 221):
            # Substitute the page number into the base URL on each iteration
            # rather than mutating one string, so every page gets its own link.
            urls.append(base_url.replace('&_pgn=1', '&_pgn={}'.format(page)))
    except (AttributeError, KeyError):
        print('AttributeError or KeyError; returning what was collected')
    return urls


def threaded_urls():
    '''Builds search URLs for every category listed in cat_list.txt, concurrently.'''
    urls = []
    with open('cat_list.txt') as jf:
        cat_list = json.load(jf)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for result in executor.map(get_isurl, cat_list):
            urls.extend(result)
    return urls


def get_ids(url):
    '''
    Scrapes listing links on a search-results page for the item ID in each URL.
    '''
    html = requests.get(url, timeout=24).text
    soup = b(html, "html.parser")
    links = soup.find_all(href=re.compile(r"\d+(?=\?hash)"))
    ids = [re.findall(r"\d+(?=\?)", link['href'])[0] for link in links]
    return list(set(ids))  # necessary; two links per listing match the pattern


def threaded_get_ids(urls):
    '''Collects item IDs from every search URL and chunks them 20 per string.'''
    ids = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for result in executor.map(get_ids, urls):
            ids.extend(result)
    # Comma-joined groups of 20 item IDs maximize the dataset per request and
    # decrease the number of downstream calls, given per-call ID limits.
    item_id_results = [','.join(ids[n:n + 20]) for n in range(0, len(ids), 20)]
    with open('ids.txt', 'w') as f:
        json.dump(ids, f)
    return item_id_results


def main():
    urls = threaded_urls()
    item_id_results = threaded_get_ids(urls)
    return item_id_results


if __name__ == "__main__":
    main()
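# ---------------------------------------------------------------------------
# Sketch of the companion files this script expects. Their exact contents are
# not part of this file; only their shape is inferred from how they are read
# above, so treat the examples below as assumptions.
#
#   cat_list.txt  - a JSON array of eBay category IDs, e.g.:
#                       ["11450", "177", "293"]
#
#   config.py     - exposes a `sec` dict holding the Finding API app ID, e.g.:
#                       sec = {"SECURITY-APPNAME": "<your-ebay-app-id>"}
# ---------------------------------------------------------------------------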