debugging dl_pictures
This commit is contained in:
parent
34a6451400
commit
5532a82d8d
121
ebay_api.py
@@ -125,10 +125,13 @@ class ShoppingApi:
        try:
            response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
            response.raise_for_status()

        except requests.exceptions.RequestException:
            print('connection error')

        response = response.json()
        response = response['Item']

        return response

    def conky(self):
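For reference, the request/error-handling pattern this hunk wraps around the Shopping API call, pulled out as a standalone sketch. The helper name and the None return are illustrative assumptions; only the requests calls mirror the diff.

import requests

def get_shopping_items(params):
    """Hedged sketch: call the eBay Shopping endpoint and return its 'Item' list."""
    try:
        response = requests.get("https://open.api.ebay.com/shopping?",
                                params=params, timeout=1)
        response.raise_for_status()  # turns 4xx/5xx responses into HTTPError
    except requests.exceptions.RequestException:
        # covers connection errors, timeouts, and the HTTPError raised above
        print('connection error')
        return None
    return response.json()['Item']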
@@ -136,8 +139,8 @@ class ShoppingApi:
        For some reason item_id_results can only be passed as argument in executor.map
        if the variable is made within function
        '''
        data = [] # TODO I think you need to append a list of dictionaries rather than update a dictionary of dictionaries. Training var will require an updated dictionary though
        finding = FindingApi(4, 2)
        data = []
        finding = FindingApi(4, 2) # TODO replace these test values before production
        item_id_results = finding.get_ids_from_cats()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
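Related to the docstring note on executor.map: a toy, self-contained illustration of the pattern conky() uses. The Demo class and its data are stand-ins, not the project's classes; the point is that executor.map takes the iterable as an argument and yields results directly (the loop variable called "future" in the diff is really a return value).

import concurrent.futures

class Demo:
    def fetch(self, batch):
        # stand-in for self.get_item_from_findItemsByCategory(batch)
        return [item.upper() for item in batch]

    def run(self):
        item_id_results = [['a', 'b'], ['c', 'd']]  # stand-in for finding.get_ids_from_cats()
        data = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for result in executor.map(self.fetch, item_id_results):
                data.extend(result)  # map() yields results, not Future objects
        return data

print(Demo().run())  # ['A', 'B', 'C', 'D']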
@@ -199,6 +202,9 @@ class CurateData:
        '''
        interm_df1 = pd.Series(training.ItemSpecifics)
        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])

        # Necessary for json_normalize():

        nvl_dict = interm_df1.apply(lambda x: {k:v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])})
        nvl_df = pd.json_normalize(nvl_dict)
        nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)
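To make the ItemSpecifics flattening in this hunk concrete, here is a tiny made-up example. The sample record and column names are invented, the dict comprehension is written in its shorter equivalent form, and tolist() is added only so older pandas versions accept the input to json_normalize.

import pandas as pd

training = pd.DataFrame({
    'PictureURL': [['http://example.com/a.jpg']],
    'ItemSpecifics': [{'NameValueList': [{'Name': 'Brand', 'Value': ['Nike']},
                                         {'Name': 'Color', 'Value': ['Black']}]}],
})
interm_df1 = training.ItemSpecifics.apply(lambda x: x['NameValueList'])
nvl_dict = interm_df1.apply(lambda x: {n['Name']: n['Value'] for n in x})
nvl_df = pd.json_normalize(nvl_dict.tolist())   # one column per Name: Brand, Color
nvl_training = pd.concat([training.PictureURL, nvl_df], axis=1)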
@@ -251,21 +257,11 @@ class CurateData:
        dropd = nvl_training.drop(col_drop, axis=1)
        return dropd

    # def combine_nvlclass(self, class_training, dropd):
    #     final_training = pd.concat([class_training, dropd], axis=1)
    #     return final_training # TODO might not need this function
    #
    # def make_dict_pics(self, expanded_class_training):
    #     with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up
    #         dict_pics = json.load(jf)
    #         dict_pics.extend('<')


    def expand_nvlclass(self, class_training, dropd):
        '''
        takes image url list from each cell and expands them into separate/duplicate
        instances. Modifies both class training and dropd dfs. Appends custom
        image url dict {'source':'destination'}.
        image url dict {'source':'target'}.
        '''
        expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object
        expanded_class = expanded_class.dropna(subset=['PictureURL'])
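A small, made-up example of what the explode()/dropna() pair in expand_nvlclass does to rows whose PictureURL cell holds a list; the sample frame is invented for illustration.

import pandas as pd

class_training = pd.DataFrame({
    'PictureURL': [['http://example.com/a.jpg', 'http://example.com/b.jpg'], []],
    'Brand': ['Nike', 'Adidas'],
})
expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
expanded_class = expanded_class.dropna(subset=['PictureURL'])
# The first row is duplicated once per URL; the empty list becomes NaN and is
# dropped, so the 'Adidas' row disappears from the expanded frame.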
@@ -278,77 +274,53 @@ class CurateData:

        expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

        pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
        destination = 'your target folder' # decide whether or not you want to set a default folder to have the user define it as input every time. or have this only
        temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
        # defined in the download function

        # '''Will use temp_dict_pics for changing the training set at preprocessing'''
        # temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list}
        # # TODO decide if the above is necesssary at this point or if it should
        # # be created at preprocessing or download
        #
        # with open('dict_pics.txt', 'w') as f:
        #     try:
        #         dict_pics = json.load(f)
        #         dict_pics.update(temp_dict_pics)
        #         json.dump(dict_pics, f) # TODO This completely overwrites the old file. Fix to exclude corruptions
        #
        #     except ValueError:
        #         json.dump(temp_dict_pics, f)

        with open('pics_source_list.txt', 'a+') as f: # Temp iterable for use w/executor
        with open('temp_pics_source_list.txt', 'a+') as f: # Temp iterable for use w/executor
            try:
                pics_source_list = json.load(f)
                pics_source_list.append(pics_source_list)
                json.dump(pics_source_list, f)
                temp_pics_source_list = json.load(f)
                temp_pics_source_list.append(temp_pics_source_list)
                json.dump(temp_pics_source_list, f) # TODO This creates duplicates incorrectly
            except ValueError:
                json.dump(pics_source_list, f)
                json.dump(temp_pics_source_list, f)

        return expanded_class, expanded_dropd

    def dl_pictures(self):
    def dl_pictures(self, *args):
        '''
        Downloads pictures from api to local storage using pics_source_list
        Downloads pictures from api to local storage using temp_pics_source_list
        and creates custom {source:target} dictionary as dict_pics
        '''

        with open('target_dirs.txt', 'a+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
        print('shitballs')
        with open('target_dirs.txt', 'w+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
            try:
                target_dir = json.load(f)

            except ValueError:
                target_dir = input('No target directory found. Create One? [y] or [n]:')
                if target_dir == 'y':
                    target_dir = input('Please provide full URL to destination folder')
                if target_dir == 'y' or 'Y':
                    target_dir = input('Please provide full URL to destination folder:')
                else:
                    print('Creating default folder in current directory')
                    target_dir = os.getcwd()
                json.dump(target_dir, f)
                    print('Creating default folder in current directory, ' + target_dir)

        with open('dict_pics.txt') as jf:
            dict_pics = json.load(jf)
        with open('temp_pics_source_list.txt') as f:
            try:
                if args:
                    temp_pics_source_list = args
                else:
                    temp_pics_source_list = json.load(f)
            except ValueError:
                if args:
                    temp_pics_sources_list = args
                else:
                    print('url list not found. download aborted')
                    return

        with open('pics_source_list.txt') as f:
            pics_source_list = json.load(f)

        def dl_pic(pic):

            if os.path.exists(dict_pics[pic]):
                pass

            else:
                r = requests.get(pic, stream=True)
                r.raw.decode_content = True
                with open(dict_pics[pic], 'wb') as f: # might not work?
                    shutil.copyfileobj(r.raw, f)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(dl_pic, pics_source_list):
                future

        temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list}
        # TODO decide if the above is necesssary at this point or if it should
        # be created at preprocessing or download
        temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in temp_pics_source_list}

        with open('dict_pics.txt', 'w') as f:
            try:
@@ -359,9 +331,24 @@ class CurateData:
            except ValueError:
                json.dump(temp_dict_pics, f)

        with open('pics_source_list.txt','w') as f:
            pics_source_list = []
            json.dump(pics_source_list, f)
        def dl_pic(pic,dict_pics):

            if os.path.exists(dict_pics[pic]): # or call temp_dict_pics[pic] can work
                pass

            else:
                r = requests.get(pic, stream=True)
                r.raw.decode_content = True
                with open(temp_dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
                    shutil.copyfileobj(r.raw, f)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(dl_pic, temp_pics_source_list):
                future

        with open('temp_pics_source_list.txt','w') as f: # Overwrites old when complete
            temp_pics_source_list = []
            json.dump(temp_pics_source_list, f)

    # TODO pipeline gameplan: 5 files: master img download dict,raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
    # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
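The {source: target} mapping that dl_pictures builds with the regex comprehension can be hard to read inline; this standalone sketch shows the same idea on one sample URL. The sample URL, the helper name, and the os.path.join usage (instead of plain string concatenation onto target_dir) are assumptions for illustration, not the commit's code.

import os
import re

def target_path(url, target_dir):
    """Pull the unique image name out of an eBay picture URL and map it into target_dir."""
    match = re.search(r'[^/]+(?=/\$_|.jpg)', url, re.IGNORECASE)
    return os.path.join(target_dir, match.group() + '.jpg') if match else None

temp_pics_source_list = ['http://i.ebayimg.com/00/s/ODQw/z/abcAAOSw/$_12.JPG']
dict_pics = {url: target_path(url, os.getcwd()) for url in temp_pics_source_list}
# e.g. {'http://i.ebayimg.com/00/s/ODQw/z/abcAAOSw/$_12.JPG': '<cwd>/abcAAOSw.jpg'}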
@@ -389,6 +376,6 @@ if __name__ == "__main__":

# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
# TO AVOID HICCUPS WHEN CREATING DATASET
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF Shoe TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.

# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)
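On the try/except TODO above: the picture downloads are among the calls with no error handling yet, and the new dl_pic(pic, dict_pics) signature no longer matches the single-argument callable that executor.map supplies. A possible standalone shape for that piece; the download_all wrapper, the functools.partial binding, the timeout, and the error handling are suggestions, not the commit's code.

import concurrent.futures
import functools
import os
import shutil
import requests

def dl_pic(dict_pics, pic):
    """Download one picture to its mapped target path, skipping files that already exist."""
    if os.path.exists(dict_pics[pic]):
        return
    try:
        r = requests.get(pic, stream=True, timeout=5)
        r.raise_for_status()
    except requests.exceptions.RequestException:
        print('download failed for ' + pic)
        return
    r.raw.decode_content = True  # decode gzip/deflate so the raw stream is the image bytes
    with open(dict_pics[pic], 'wb') as f:
        shutil.copyfileobj(r.raw, f)  # stream the body straight to disk

def download_all(dict_pics, temp_pics_source_list):
    # executor.map passes one item per call, so dict_pics is bound up front
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list(executor.map(functools.partial(dl_pic, dict_pics), temp_pics_source_list))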