Added method to update category IDs var and txt file
parent 6cfa798902
commit c8ab1b13d9

ebay_api.py: 98 lines changed

@@ -11,7 +11,10 @@ import shutil
 import re
 
 class FindingApi:
-    '''Methods for accessing eBays FindingApi services'''
+    '''
+    Methods for accessing eBay's FindingApi services
+    '''
+
     def __init__(self, service, pageNumber):
         self.service = [
             'findItemsAdvanced', 'findCompletedItems',
@@ -20,13 +23,45 @@ class FindingApi:
             ][service]
        self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
 
        # departments = ["3034","93427"] (womens and mens)
 
        # examples of additional params you may want to add:
        # 'itemFilter(0).value':'Used'
        # 'itemFilter(1).name':'ListingType'
        # 'itemFilter(1).value':'AuctionWithBIN'
 
+    def update_cats(self):
+
+        parent_cats = ['3034', '93427']
+        cat_list = []
+
+        for department in parent_cats:
+
+            params = {
+                "callname":"GetCategoryInfo",
+                "appid":cfg.sec['SECURITY-APPNAME'],
+                "version":"671",
+                "responseencoding":"JSON",
+                "CategoryID":department,
+                "IncludeSelector":"ChildCategories",
+            }
+
+            try:
+                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
+                response.raise_for_status()
+
+            except requests.exceptions.RequestException:
+                print('connection error')
+
+            response = response.json()
+            response = response['CategoryArray']['Category'][1:]
+            temp_cat_list = [cat['CategoryID'] for cat in response]
+            cat_list.extend(temp_cat_list)
+
+        with open('cat_list.txt', 'w') as f:
+            json.dump(cat_list, f)
+
+        # leaf_list = [node['LeafCategory'] for node in response]
+        return cat_list
+
     def get_data(self, category_id, i):
 
         '''
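The new update_cats parses the Shopping API response as CategoryArray.Category with the parent category sitting in the first slot, which is why it slices with [1:]. A minimal sketch of that extraction against a stubbed payload; the payload shape is an assumption inferred from the code above, and the child category IDs here are made up:

    stub = {
        "CategoryArray": {
            "Category": [
                {"CategoryID": "3034"},    # parent category, dropped by the [1:] slice
                {"CategoryID": "11111"},   # hypothetical child category IDs
                {"CategoryID": "22222"},
            ]
        }
    }

    children = stub["CategoryArray"]["Category"][1:]
    cat_list = [cat["CategoryID"] for cat in children]
    print(cat_list)  # ['11111', '22222']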
@@ -127,7 +162,7 @@ class ShoppingApi:
             response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
             response.raise_for_status()
 
-        except requests.exceptions.RequestException:
+        except requests.exceptions.RequestException: # TODO need better handling
            print('connection error')
 
        response = response.json()
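The new TODO points at a real gap: when the request fails, the code only prints 'connection error' and still falls through to response.json() on a failed or missing response. One possible direction, shown only as a sketch with a hypothetical get_with_retries helper and not what this commit implements, is to retry with a short backoff and re-raise on the final failure:

    import time
    import requests

    def get_with_retries(url, params, attempts=3, timeout=4):
        for attempt in range(attempts):
            try:
                response = requests.get(url, params=params, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException:
                if attempt == attempts - 1:
                    raise                  # give up after the last attempt
                time.sleep(2 ** attempt)   # 1s, 2s, ... between attempts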
@@ -137,8 +172,8 @@ class ShoppingApi:
 
     def conky(self):
         '''
-        For some reason item_id_results can only be passed as argument in executor.map
-        if the variable is made within function
+        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
+        data for creating training sets
         '''
         data = []
         finding = FindingApi(4, 2) # TODO replace these test values before production
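The rewritten conky docstring describes fanning get_item_from_findItemsByCategory out over threads. A rough sketch of that pattern, assuming item_id_results is an iterable of ID strings built inside the calling function and that each call returns a list of item dicts; both are assumptions, since the method body is not part of this hunk and conky_sketch is just an illustrative name:

    import concurrent.futures

    def conky_sketch(shopping_api, item_id_results):
        data = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = executor.map(shopping_api.get_item_from_findItemsByCategory, item_id_results)
            for result in results:
                data.extend(result)  # assumed: each call returns a list of item dicts
        return data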
@@ -157,7 +192,7 @@
 
 class CurateData:
     '''
-    Contains functions for curating data for machine learning training sets;
+    Contains methods for curating data for machine learning training sets;
     Takes item in data from ShoppingApi request as argument and extracts/ creates key
     value pairs that gets updated to custom dataframe used in Ml training sets.
     '''
@@ -220,7 +255,7 @@
 
         return extracted_df
 
-    def drop_nvl_cols(self, nvl_training):
+    def drop_nvl_cols(self, nvl_training): # NOTE this is wonky
        col_drop = [
            'Fabric Type', 'Type of Sport', 'Mid Sole', 'Modified Item',
            'Modification Description', 'Article Type', 'Customized',
@@ -281,15 +316,16 @@
 
         try:
             with open('temp_pics_source_list.txt') as f:
-                temp_pics_source_list = json.load(f)
-                temp_pics_source_list.append(temp_pics_source_list)
-                temp_pics_source_list = list(set(temp_pics_source_list))
+                tpsl = json.load(f)
+                tpsl.extend(temp_pics_source_list)
+                temp_pics_source_list = list(set(tpsl))
             with open('temp_pics_source_list.txt', 'w') as f:
                 json.dump(temp_pics_source_list, f)
         except (ValueError, FileNotFoundError):
             with open('temp_pics_source_list.txt', 'w') as f:
                 json.dump(temp_pics_source_list, f)
 
        # TODO still need to save these as csv files
        return expanded_class, expanded_dropd
 
     def dl_pictures(self, *args):
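The replaced block is worth a second look. The old code appended the loaded list to itself, which leaves a list nested inside itself and makes list(set(...)) raise TypeError, and by rebinding the name on load it also discarded whatever URLs had just been collected in memory. The new extend-then-set version is the usual merge-and-dedupe, illustrated here with throwaway values:

    old = ["url_a", "url_b"]
    old.append(old)              # the list now contains a reference to itself
    # list(set(old))             # would raise TypeError: unhashable type: 'list'

    on_disk = ["url_a", "url_b"]
    in_memory = ["url_b", "url_c"]
    on_disk.extend(in_memory)
    merged = list(set(on_disk))  # note: set() does not preserve order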
@@ -301,16 +337,16 @@
         try:
             with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
                 target_dir = json.load(f)
 
         except (ValueError, FileNotFoundError):
             target_dir = input('No target dirctory found. Create One? [y] or [n]:')
             if target_dir == ('y' or 'Y'):
-                target_dir = input('Please provide full URL to destination folder:')
-                with open('target_dirs.txt','w+') as f:
+                target_dir = input('Please provide full URL to destination folder:') # TODO need to catch human syntax errors here
+                with open('target_dirs.txt','w') as f:
                     json.dump(target_dir, f)
             else:
-                target_dir = os.mkdir(os.getcwd()+os.sep+'training_images')
-                with open('target_dirs.txt','w+') as f:
+                os.mkdir(os.getcwd()+os.sep+'training_images')
+                target_dir = os.getcwd()+os.sep+'training_images'
+                with open('target_dirs.txt','w') as f:
                     json.dump(target_dir, f)
                 print('Creating default folder in current directory @ ' + target_dir)
 
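One way to approach the new TODO about catching human syntax errors in the typed destination path, sketched under the assumption that a plain filesystem path is wanted and using a hypothetical resolve_target_dir helper rather than anything this commit implements: normalize the input and create the folder if it is missing. As an aside, the ('y' or 'Y') comparison in the existing code only ever tests against 'y', since that expression evaluates to 'y'.

    import os

    def resolve_target_dir(raw_path):
        path = os.path.abspath(os.path.expanduser(raw_path.strip()))
        os.makedirs(path, exist_ok=True)  # tolerate an already-existing folder
        return path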
@@ -332,7 +368,7 @@
         try:
             with open('dict_pics.txt') as f:
                 dict_pics = json.load(f)
-                dict_pics.update(temp_dict_pics)
+                dict_pics.update(temp_dict_pics) # TODO This still creates duplicates
             with open('dict_pics.txt', 'w') as f:
                 json.dump(dict_pics, f)
 
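On the new TODO next to dict_pics.update: dict.update overwrites existing keys rather than duplicating them, so any duplicate downloads are more likely to come from the source list than from this dict. If the goal is to skip work for sources already recorded, filtering before downloading is one option; this is only a sketch, not part of the commit, and the missing name is made up:

    missing = {src: dest for src, dest in temp_dict_pics.items() if src not in dict_pics}
    dict_pics.update(missing)
    # only the URLs in `missing` would then need to be downloaded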
@@ -344,7 +380,8 @@
         def dl_pic(dict_pics, pic):
 
             if os.path.exists(dict_pics[pic]): # or call temp_dict_pics[pic] can work
-                pass
+                pass # TODO This is not catching duplicates for some reason....possibly not? Upon inspection, files aren't duplicates...but why?
+                # TODO it would mean that temp_pics_source_list is changing for some reason?
 
             else:
                 r = requests.get(pic, stream=True)
@@ -352,16 +389,31 @@
                 with open(temp_dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
                     shutil.copyfileobj(r.raw, f)
 
-        breakpoint()
         bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
 
         with concurrent.futures.ThreadPoolExecutor() as executor:
             for future in executor.map(lambda p: dl_pic(*p), bargs):
                 future
 
-        with open('temp_pics_source_list.txt','w') as f: # Overwrites old when complete
-            temp_pics_source_list = []
-            json.dump(temp_pics_source_list, f)
+        os.remove('temp_pics_source_list.txt') # Deletes file after downloads complete successfully
+
+class PreProcessing:
+    '''
+    Includes methods for pre-processing training set input and labels in the
+    training set created from CurateData class. Whereas CurateData training
+    sets provided trimmed down data from the raw json response from the
+    ShoppingApi call and provided a bare minimum format for the dataframe to be
+    used in training, PreProcessing optimizes that dataframe for training and
+    includes methods for image manipulation, creating test/train/validation
+    splits, etc.
+    '''
+
+    def stt_training(self, dict_pics, expanded_class, expanded_dropd):
+        '''
+        Source to target training. Replaces source image URL with target URL
+        determined by values in dict_pics variable.
+        '''
+        pass
 
 # TODO pipeline gameplan: 5 files: master img download dict,raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
 # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
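A small note on the download loop kept by this hunk: executor.map yields the functions' return values directly, not Future objects, so the bare future statement in the loop body is a no-op that simply drives the iterator. An equivalent that makes the intent explicit, written only as a sketch reusing the dl_pic and bargs names from the diff:

    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(dl_pic, *args) for args in bargs]
        for fut in concurrent.futures.as_completed(futures):
            fut.result()  # re-raises any exception that dl_pic hit in its thread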