diff --git a/ebay_api.py b/ebay_api.py
index 6515942..234c084 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -23,147 +23,6 @@ from ebaysdk.finding import Connection as Finding
 from ebaysdk.shopping import Connection as Shopping
 
 
-class FindingApi:
-    '''
-    Methods for accessing eBay's FindingApi services
-    '''
-
-    def __init__(self, service, idspc):
-        self.service = [
-            'findItemsAdvanced', 'findCompletedItems',
-            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
-            'findItemsByProduct'
-        ][service] # Currently using only index 4, i.e., service = 4
-        self.idspc = idspc # examples of additional params you may want to add:
-        # 'itemFilter(0).value':'Used' consider using this with findCompletedItems call
-        # 'itemFilter(1).name':'ListingType'
-        # 'itemFilter(1).value':'AuctionWithBIN'
-        # 'StartTimeNewest'
-        # HideDuplicateItems
-
-    def get_data(self, category_id, idspc):
-
-        '''
-        Gets raw JSON data fom FindingApi service call. Currently being used to
-        get itemIDs from categories;
-        '''
-#        startTime = dateutil.parser.isoparse( startTime )
-#        now = datetime.datetime.now(tz=pytz.UTC)
-#        days_on_site = (now - startTime).days # as int
-
-        ids = []
-        modTimeFrom = datetime.now() - timedelta(seconds=5) # initialize modTimeFrom value
-        i = 1
-        params = {
-            "OPERATION-NAME":self.service,
-            "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
-            "SERVICE-VERSION":"1.13.0",
-            "RESPONSE-DATA-FORMAT":"JSON",
-            "categoryId":category_id,
-            "paginationInput.entriesPerPage":"20",
-            "paginationInput.PageNumber":i,
-            "itemFilter(0).name":"Condition",
-            "itemFilter(0).value":"Used",
-            "itemFilter.name":"HideDuplicateItems",
-            "itemFilter.value":"true",
-            "sortOrder":"StartTimeNewest",
-        }
-
-#            "itemFilter.name(2)":"modTimeFrom",
-#            "itemFilter.value(2)":modTimeFrom,
-
-        while len(ids) < idspc:
-
-            try:
-                response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
-                                        params=params, timeout=24)
-                response.raise_for_status()
-
-            except requests.exceptions.RequestException: # appears this works need to be able to continue where you left off or use better timeout?
-                print('connection error')
-                return ids
-            try:
-                data = response.json()
-                itemSearchURL = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
-                modTimeFrom = data['findItemsByCategoryResponse'][0]['searchResult'][0]['item'][-1]['listingInfo'][0]['startTime'][0]
-                modTimeFrom = dateutil.parser.isoparse( modTimeFrom )
-                modTimeFrom = modTimeFrom - timedelta(seconds=5) # TODO NEED BACK TO GMT FORMAT
-                for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
-#                    if item not in ids:
-                    ids.append(item['itemId'][0])
-
-                #ids = list(set(ids))
-
-            except (AttributeError, KeyError):
-                print('AttributeError or KeyError. Exiting')
-                print(response.json())
-                return ids
-
-            input('press enter to continue')
-            i+=1
-            params = {
-                "OPERATION-NAME":self.service,
-                "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
-                "SERVICE-VERSION":"1.13.0",
-                "RESPONSE-DATA-FORMAT":"JSON",
-                "categoryId":category_id,
-                "paginationInput.entriesPerPage":"20",
-                "paginationInput.PageNumber":i,
-                "itemFilter(0).name":"Condition",
-                "itemFilter(0).value":"Used",
-                "itemFilter.name":"HideDuplicateItems",
-                "itemFilter.value":"true",
-                "sortOrder":"StartTimeNewest",
-            }
-
-        return ids, data, modTimeFrom, itemSearchURL
-
-# TODO add some other options to finding call api such as for possibly filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
-# try and come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say for example, if you took a
-# random set of 3 of 8 pictures total from each listing you might have a better chance of getting 3 good pictures in addition to increasing your training set. Or maybe you would have better luck with limiting
-# it to the first 5 pictures instead of random.
-
-# You may even have more consistency with used shoes since they are "one-off" items without confusing multiple variations and colors. What else you can do is run small training sets on both new and used
-# to see which one is more accurate or if a combo of both is more accurate.
-
-    def get_ids_from_cats(self): #TODO need to resolve duplicates here to maximize unique ids/data and ShopppingApi call
-        '''
-        Creates a 20-itemId list to use for the ShoppingApi
-        call
-        '''
-#        target_idspc = self.target_idspc
-        idspc = self.idspc
-
-        itemid_results_list = []
-
-        with open('cat_list.txt') as jf:
-            cat_list = json.load(jf)
-
-        for cat in cat_list:
-            args = [(cat, idspc) for cat in cat_list]
-
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for future in executor.map(lambda p: self.get_data(*p), args):
-                itemid_results_list.extend(future)
-
-        print(len(itemid_results_list))
-        a = list(set(itemid_results_list))
-        print(len(a))
-        input('press enter to continue')
-
-        with open('raw_ids.txt', 'w') as f:
-            json.dump(itemid_results_list, f)
-
-        item_id_results = [','.join(itemid_results_list[n:n+20]) for n in list(range(0,
-            len(itemid_results_list), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints
-        return item_id_results, itemid_results_list
-
-# TODO during your try except conditionals just check the csv files. At the end you can create sets. You can creat another condition that says if the final set is smaller than 100k then you can call finding
-# service on more pages (but only pages you haven't tried) and repeat the search process.
-
-# TODO instead of running through multiple try except loops try to implement set methods for efficiency and ease. Remember symmetric_difference, difference, intersection, set()
-#    for category_id in cat_list:
-
 class ShoppingApi:
     '''
     Creates objects from ShoppingApi service calls that can interact with
diff --git a/scrape_ids.py b/scrape_ids.py
index d0a93ab..6a9ba64 100644
--- a/scrape_ids.py
+++ b/scrape_ids.py
@@ -65,12 +65,15 @@ def threaded_urls():
     return urls
 
 def get_ids(url):
+    '''
+    Scrapes listing links for item ID in url
+    '''
     html = requests.get(url).text
    soup = b(html, "html.parser")
     ids = list(soup.find_all(href=re.compile(r"[\d]+(?=\?hash)")))
     ids = [id['href'] for id in ids]
     ids = [re.findall(r"[\d]+(?=\?)", id)[0] for id in ids]
-    ids = list(set(ids))
+    ids = list(set(ids)) # necessary; two links are returned with pattern match
     return ids
 
 
@@ -81,10 +84,10 @@ def threaded_get_ids(urls):
     with concurrent.futures.ThreadPoolExecutor() as executor:
         for future in executor.map(get_ids, urls):
             ids.extend(future)
-    with open('ids.txt', 'w') as f:
-        json.dump(ids, f)
     item_id_results = [','.join(ids[n:n+20]) for n in list(range(0,
-                        len(ids), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints
+        len(ids), 20))] # 20-ItemID list created to maximize dataset/decrease calls given call constraints
+    with open('ids.txt', 'w') as f:
+        json.dump(ids, f)
     return item_id_results
 
 def main():
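For context on the batching comment kept in threaded_get_ids ("20-ItemID list created to maximize dataset/decrease calls given call constraints"): the list comprehension packs item IDs into comma-separated strings of at most 20, so each ShoppingApi request carries as many items as the call constraints allow. Below is a minimal sketch of that idea in isolation, assuming a flat list of already-deduplicated item-ID strings; the helper name batch_item_ids is illustrative and not part of this patch.

    # Illustrative sketch (not part of the patch): the 20-ID batching used by
    # threaded_get_ids, shown on its own with a hypothetical helper name.
    def batch_item_ids(ids, batch_size=20):
        '''Join item IDs into comma-separated groups of at most batch_size,
        one group per ShoppingApi call.'''
        return [','.join(ids[n:n + batch_size]) for n in range(0, len(ids), batch_size)]

    if __name__ == '__main__':
        sample = [str(100000000000 + n) for n in range(45)]  # 45 fake item IDs
        batches = batch_item_ids(sample)
        print(len(batches))                # 3: two full batches of 20 plus one of 5
        print(batches[0].count(',') + 1)   # 20 IDs in the first batch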