from bs4 import BeautifulSoup
import re
import json
import requests
import concurrent.futures
import config as cfg


def get_isurl(category_id):  # "get itemSearchURL"
    '''
    Gets raw JSON data from the FindingApi service call. Currently
    used to get itemSearchURLs for a given category.
    '''
    params = {
        "OPERATION-NAME": 'findItemsByCategory',
        "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
        "SERVICE-VERSION": "1.13.0",
        "RESPONSE-DATA-FORMAT": "JSON",
        "categoryId": category_id,
        "paginationInput.entriesPerPage": "1",
        "paginationInput.pageNumber": 1,
        "itemFilter(0).name": "Condition",
        "itemFilter(0).value": "Used",
        "itemFilter(1).name": "HideDuplicateItems",  # repeated itemFilters must be indexed
        "itemFilter(1).value": "true",
    }
    urls = []
    try:
        response = requests.get(
            "https://svcs.ebay.com/services/search/FindingService/v1",
            params=params, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print('Connection error: {}'.format(err))
        return urls

    try:
        data = response.json()
        # NOTE approx 220 pages of listings per cat @ 35 items per page
        item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1"  # preowned
        item_cond_new = '&LH_ItemCondition=3'  # new condition (currently unused)
        base_url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
        for pg in range(1, 34):  # No results after around page 32
            url = base_url + "&_pgn=" + str(pg) + item_cond
            urls.append(url)
    except (AttributeError, KeyError):
        print('AttributeError or KeyError. Exiting')
    return urls


def threaded_urls():
    '''
    Runs get_isurl() over every category in cat_list.txt within a
    ThreadPoolExecutor() and flattens the results into one url list.
    '''
    urls = []
    with open('cat_list.txt') as jf:
        cat_list = json.load(jf)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for result in executor.map(get_isurl, cat_list):
            urls.extend(result)
    return urls


def get_ids(url):
    '''
    Scrapes listing links on a search results page for the item ID
    embedded in each link's url.
    '''
    html = requests.get(url, timeout=24).text
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all(href=re.compile(r"\d+(?=\?hash)"))
    hrefs = [link['href'] for link in links]
    ids = [re.findall(r"\d+(?=\?)", href)[0] for href in hrefs]
    return ids


def threaded_get_ids(urls):
    '''
    Runs get_ids() w/in ThreadPoolExecutor() for multithreaded requests.
    Constructs and saves unique ids and 20-itemID strings for use with
    ebay_api methods.
    '''
    try:
        with open('ids.txt') as f:
            ids = json.load(f)
    except FileNotFoundError:
        ids = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for result in executor.map(get_ids, urls):
            ids.extend(result)
    ids = list(set(ids))  # necessary; two links are returned per pattern match
    # 20-itemID strings created to maximize dataset/decrease calls given call constraints
    item_id_results = [','.join(ids[n:n + 20]) for n in range(0, len(ids), 20)]
    with open('ids.txt', 'w') as f:
        json.dump(ids, f)
    with open('item_id_results.txt', 'w') as f:
        json.dump(item_id_results, f)
    return item_id_results


def id_count():
    '''
    Counts unique IDs in item_id_results for testing.
    '''
    with open('item_id_results.txt') as f:
        item_id_results = json.load(f)
    ids = ','.join(item_id_results).split(',')
    uniq = len(set(ids))
    print('{} Unique IDs'.format(uniq))
    return ids


def main():
    urls = threaded_urls()
    item_id_results = threaded_get_ids(urls)
    return item_id_results


if __name__ == "__main__":
    main()
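

# The 20-ID batches produced by threaded_get_ids() are sized for endpoints
# that accept up to 20 ItemIDs per request, such as the eBay Shopping API's
# GetMultipleItems call. Below is a minimal sketch of consuming one batch;
# get_multiple_items() is a hypothetical helper (not part of this module's
# ebay_api code), and it assumes the same cfg.sec['SECURITY-APPNAME']
# credential used by get_isurl() is valid for the Shopping API.
def get_multiple_items(id_batch):
    '''
    Sketch: fetches item details for one comma-separated batch of up to
    20 IDs (e.g. item_id_results[0]) via GetMultipleItems.
    '''
    params = {
        "callname": "GetMultipleItems",
        "appid": cfg.sec['SECURITY-APPNAME'],  # assumed to double as the Shopping API appid
        "version": "967",
        "responseencoding": "JSON",
        "ItemID": id_batch,  # comma-separated string of up to 20 item IDs
    }
    response = requests.get("https://open.api.ebay.com/shopping",
                            params=params, timeout=24)
    response.raise_for_status()
    return response.json()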