duplicate id fixes. full data not getting fetched

parent 6bdc94c8c6
commit 1faa4e86fd

ebay_api.py | 73
@@ -20,13 +20,13 @@ class FindingApi:

     Methods for accessing eBay's FindingApi services
     '''

-    def __init__(self, service, pageNumber):
+    def __init__(self, service, target_idspc): # target ids per category
         self.service = [
             'findItemsAdvanced', 'findCompletedItems',
             'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
             'findItemsByProduct'
         ][service] # Currently using only index 4, i.e., service = 4
-        self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
+        self.target_idspc = target_idspc

         # examples of additional params you may want to add:
         # 'itemFilter(0).value':'Used'; consider using this with the findCompletedItems call
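Review note: since get_data requests 100 entries per page (see the paginationInput.entriesPerPage param below), target_idspc effectively caps the number of pages fetched per category. A quick sketch of the arithmetic, using a hypothetical helper that is not part of this commit:

    import math

    def pages_needed(target_idspc, entries_per_page=100):
        # With entriesPerPage fixed at 100 in get_data, a per-category
        # target of 7692 ids takes ~77 calls, which appears to match the
        # old 77-page cap in the removed pageNumber logic.
        return math.ceil(target_idspc / entries_per_page)

    assert pages_needed(7692) == 77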
@@ -35,7 +35,7 @@ class FindingApi:
         # 'StartTimeNewest'
         # HideDuplicateItems

-    def get_data(self, category_id, i):
+    def get_data(self, category_id, target_idspc):

         '''
         Gets raw JSON data from FindingApi service call. Currently being used to
@@ -47,6 +47,8 @@ class FindingApi:
         Also consider using the exclude-duplicates param and possibly others.
         Research the eBay API docs to find candidates.
         '''
+        i = 1
+        ids = []
         params = {
             "OPERATION-NAME":self.service,
             "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
@@ -63,16 +65,40 @@ class FindingApi:
             "itemFilter.value":'true'
         }

-        # TODO add try/excepts here
-        try:
-            response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
-                                    params=params, timeout=4)
-            response.raise_for_status()
+        while len(ids) < target_idspc:
+            try:
+                response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
+                                        params=params, timeout=7)
+                response.raise_for_status()

-        except requests.exceptions.RequestException:
-            print('connection error') # TODO decide how to handle exception
-        data = response.json()
-        return data
+            except requests.exceptions.RequestException: # appears this works; need to be able to continue where you left off, or use a better timeout?
+                print('connection error') # TODO decide how to handle exception
+                return ids
+            data = response.json()
+
+            for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
+                if item['itemId'][0] not in ids: # compare ids, not the whole item dict, so dupes are actually caught
+                    ids.append(item['itemId'][0])
+
+            ids = list(set(ids))
+            i += 1
+            params = {
+                "OPERATION-NAME":self.service,
+                "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
+                "SERVICE-VERSION":"1.13.0",
+                "RESPONSE-DATA-FORMAT":"JSON",
+                "categoryId":category_id,
+                "paginationInput.entriesPerPage":"100",
+                "paginationInput.PageNumber":i,
+                "itemFilter(0).name":"Condition",
+                "itemFilter(0).value":"Used", # conditionId is recommended instead, but for some reason that doesn't work either;
+                # may not be necessary anyway if you can eliminate dupes. TODO still needs fixing. Used results are likely better than new items with low-quality brands and pics
+                "sortOrder":"StartTimeNewest",
+                "itemFilter.name":"HideDuplicateItems", # this isn't working, or is only working per page
+                "itemFilter.value":'true'
+            }
+
+        return ids

         # TODO add some other options to the Finding API call, such as filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
         # try to come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say, for example, if you took a
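Review note: returning ids on the first RequestException abandons all remaining pages, which is one way "full data not getting fetched" can happen. A minimal sketch of retrying the same page with backoff instead of bailing out; the helper name is hypothetical and this is not part of the commit:

    import time
    import requests

    def get_with_retries(url, params, attempts=3, timeout=7):
        # Retry the same page a few times with a growing delay before
        # giving up, so one transient failure doesn't end the whole loop.
        for attempt in range(attempts):
            try:
                response = requests.get(url, params=params, timeout=timeout)
                response.raise_for_status()
                return response.json()
            except requests.exceptions.RequestException:
                time.sleep(2 ** attempt)  # wait 1s, 2s, 4s between attempts
        return None  # caller can skip this page and continue with the next

Inside the while loop, a None return could then advance the page number and continue rather than returning a partial ids list.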
@@ -87,27 +113,22 @@ class FindingApi:
         Creates a 20-itemId list to use for the ShoppingApi
         call
         '''
-        pages = self.pageNumber
+        target_idspc = self.target_idspc

         itemid_results_list = []

         with open('cat_list.txt') as jf:
             cat_list = json.load(jf)

-        args = []
-        for category_id in cat_list:
-
-            bargs = [(category_id, i) for i in pages]
-            args.extend(bargs)
+        args = [(cat, target_idspc) for cat in cat_list]

         with concurrent.futures.ThreadPoolExecutor() as executor:
             for future in executor.map(lambda p: self.get_data(*p), args):
-                data = future
-
-                for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
-                    if item not in itemid_results_list:
-                        itemid_results_list.append(item['itemId'][0])
+                itemid_results_list.extend(future)

-        item_id_results = list(set(itemid_results_list))
+        with open('raw_ids.txt', 'w') as f:
+            json.dump(itemid_results_list, f)

         item_id_results = [','.join(itemid_results_list[n:n+20]) for n in list(range(0,
             len(itemid_results_list), 20))] # 20-ItemID list created to maximize dataset / decrease calls given call constraints
         return item_id_results
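Review note: the removed list(set(itemid_results_list)) line was the only cross-category dedupe, so duplicate ids appearing in more than one category can now survive into the 20-id batches; list(set(...)) also scrambles order. An order-preserving alternative, as a sketch rather than part of this commit:

    def dedupe_keep_order(ids):
        # dict preserves insertion order in Python 3.7+, so this drops
        # duplicates without shuffling the 20-id batches built afterwards.
        return list(dict.fromkeys(ids))

    print(dedupe_keep_order(['111', '222', '111', '333']))  # ['111', '222', '333']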
@@ -204,7 +225,7 @@ class ShoppingApi:
         service_dict

         fnd_srvc = input(str(service_dict) + "choose Finding call: (press 'enter' for default(4))")
-        pg_num = int(input('how many ids per cat? (7692 max)'))
+        target_idspc = int(input('how many ids per cat? (7692 max)'))

         optional_params = {
             "itemFilter(0).name":"Condition",
@@ -213,10 +234,10 @@ class ShoppingApi:

         if fnd_srvc != '':
             fnd_srvc = int(fnd_srvc)
-            finding = FindingApi(fnd_srvc, pg_num)
+            finding = FindingApi(fnd_srvc, target_idspc)
         else:
             fnd_srvc = 4
-            finding = FindingApi(fnd_srvc, pg_num)
+            finding = FindingApi(fnd_srvc, target_idspc)

         item_id_results = finding.get_ids_from_cats()
         with concurrent.futures.ThreadPoolExecutor() as executor: