diff --git a/ebay_api.py b/ebay_api.py
index cc26bd6..95f7bfd 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -1,4 +1,5 @@
 import os
+import scrape_ids
 from datetime import datetime, timedelta
 import dateutil
 from dateutil import parser
@@ -261,7 +262,7 @@ class ShoppingApi:
         fnd_srvc = 4
         finding = FindingApi(fnd_srvc, target_idspc)
 
-        item_id_results = finding.get_ids_from_cats()
+        item_id_results = scrape_ids.main()
         with concurrent.futures.ThreadPoolExecutor() as executor:
             for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
                 for item in future:
diff --git a/scrape_ids.py b/scrape_ids.py
index f9eda25..d0a93ab 100644
--- a/scrape_ids.py
+++ b/scrape_ids.py
@@ -11,6 +11,7 @@ def get_isurl(category_id): # "get itemSearchURL"
     Gets raw JSON data fom FindingApi service call. Currently being used to get itemIDs from categories;
     '''
+
     params = {
             "OPERATION-NAME":'findItemsByCategory',
             "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
@@ -35,20 +36,23 @@ def get_isurl(category_id): # "get itemSearchURL"
         return url
     try:
         data = response.json()
-        i = 1 # NOTE approx 220 pages of listings per cat @ 35 items per page
-        pg = "&_pgn={}".format(str(i))
+        # NOTE approx 220 pages of listings per cat @ 35 items per page
         item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1" # preowned
-        url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0].replace('&_pgn=1', pg)
+        urls = []
+        url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
         url = url+item_cond
-
+        j = list(range(1,221))
+        for i in j:
+            pg = "&_pgn={}".format(str(i))
+            url = url.replace('&_pgn=1', pg)
+            urls.append(url)
+
     except (AttributeError, KeyError):
         print('AttributeError or KeyError. Exiting')
-        print(response.json())
-        return data
-    return url
+    return urls
 
-def threaded_urls(url):
+def threaded_urls():
 
     urls = []
     with open('cat_list.txt') as jf:
@@ -56,13 +60,37 @@
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         for future in executor.map(get_isurl, cat_list):
-            urls.append(future)
+            urls.extend(future)
     return urls
 
+def get_ids(url):
     html = requests.get(url).text
     soup = b(html, "html.parser")
-    ids = list(ids = soup.find_all(href=re.compile(r"[\d]+(?=\?hash)")))
+    ids = list(soup.find_all(href=re.compile(r"[\d]+(?=\?hash)")))
     ids = [id['href'] for id in ids]
-    ids = [re.findall(r"[\d]+(?=\?)", id) for id in ids]
+    ids = [re.findall(r"[\d]+(?=\?)", id)[0] for id in ids]
+    ids = list(set(ids))
+    return ids
+
+def threaded_get_ids(urls):
+
+    ids = []
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        for future in executor.map(get_ids, urls):
+            ids.extend(future)
+    with open('ids.txt', 'w') as f:
+        json.dump(ids, f)
+    item_id_results = [','.join(ids[n:n+20]) for n in list(range(0,
+        len(ids), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints
+
+    return item_id_results
+def main():
+    urls = threaded_urls()
+    item_id_results = threaded_get_ids(urls)
+    return item_id_results
+
+if __name__=="__main__":
+    main()
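
As a rough illustration of what the new scrape_ids.py code is doing, the standalone Python sketch below expands a single itemSearchURL into the ~220 paginated page URLs and joins item IDs into comma-separated batches of 20. The base_url, the category number inside it, and the placeholder IDs are invented for the example and are not values from the project; only the query-string substitution and the batching mirror the patch.

# Minimal sketch (not the project's code): paginate one eBay itemSearchURL and
# batch item IDs for multi-item Shopping API calls. base_url and the IDs below
# are placeholders, not real eBay values.

base_url = "https://www.ebay.com/sch/i.html?_sacat=11450&_pgn=1"   # hypothetical itemSearchURL
item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1"                   # preowned filter, as in the patch

# One URL per results page; substitute the page number into a copy of the base
# URL each time so page 1 is not carried forward between iterations.
urls = [(base_url + item_cond).replace("&_pgn=1", "&_pgn={}".format(i))
        for i in range(1, 221)]

# Comma-join IDs 20 at a time, matching the patch's "call constraints" comment
# (the Shopping API accepts only a limited number of ItemIDs per call).
ids = [str(n) for n in range(100000000001, 100000000046)]          # 45 placeholder IDs
item_id_results = [",".join(ids[n:n + 20]) for n in range(0, len(ids), 20)]

print(len(urls))              # 220 paginated search URLs
print(len(item_id_results))   # 3 batches: 20 + 20 + 5 IDs

Joining the IDs before calling the Shopping API keeps each request near the per-call ItemID limit, which is the trade-off the in-line comment about maximizing the dataset while decreasing calls is describing.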