dl_pictures and curate.py cleanup
parent e879e80910
commit ae25ab88b6
@@ -12,7 +12,9 @@ class_training = curate.class_training(training) # creates initial class_training
 nvl_training = curate.nvl_training(training) # creates initial nvl_training
 dropd = curate.drop_nvl_cols(nvl_training) # label mask
 dropd
-expanded_dfs = curate.expand_nvlclass(class_training, dropd) # pulls values out of lists for both dfs
+# pulls values out of lists for both dfs and creates temp_pics_source_list.txt
+expanded_dfs = curate.expand_nvlclass(class_training, dropd)
 
 expanded_class = expanded_dfs[0] # TODO still having problems with Unnamed: 0 col
 expanded_dropd = expanded_dfs[1] # TODO incorrect df. Look at nvl_training func. Specifically "reindex" usage
@@ -20,7 +22,7 @@ expanded_dropd = expanded_dfs[1] # TODO incorrect df. Look at nvl_training func.
 download = input('download images?: ')
 if ('y' or 'Y') in download:
     with open('temp_pics_source_list.txt') as f:
-        test_list = json.load(f)
-    curate.dl_pictures(test_list)
+        url_list = json.load(f)
+    curate.dl_pictures(url_list)
 else:
     pass
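One thing this hunk keeps as-is: ('y' or 'Y') evaluates to plain 'y', so the branch only fires when a lowercase y appears in the input. If a stricter prompt check is ever wanted, a minimal sketch (not part of this commit, assuming the same json/curate imports as the script above):

    download = input('download images?: ')
    if download.strip().lower().startswith('y'):  # accepts y, Y, yes, Yes, ...
        with open('temp_pics_source_list.txt') as f:
            url_list = json.load(f)
        curate.dl_pictures(url_list)
    else:
        pass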
82 ebay_api.py
@@ -168,7 +168,7 @@ class ShoppingApi:
     def get_item_from_findItemsByCategory(self, twenty_id):
 
         '''
-        Gets raw JSON data from multiple live listings given multiple itemIds
+        Gets raw JSON data from multiple live listings given multiple itemIds
         '''
 
         headers = {
@@ -212,10 +212,11 @@ class ShoppingApi:
                 for item in future:
                     data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
             else:
-                print('reached call limit')
+                print('response is None')
                 break
         with open('raw_data.txt', 'w') as f:
             json.dump(data, f)
         return data
 
     # NOTE:
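data ends up in raw_data.txt as one JSON array of listing dicts, so the curation side can reload it in a single call; a minimal sketch (only the file name comes from this diff, the rest is assumed):

    import json

    with open('raw_data.txt') as f:
        raw_data = json.load(f)  # list of dicts, one per listing
    print(len(raw_data), 'listings')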
@@ -359,10 +360,7 @@ class CurateData:
 
             expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
 
-            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # TODO because var is del after dl_pictures you may be
-            # getting duplicate pictures. ie, expanded_class.PictureURL is a master series and will write temp_pics_source_list as such
-            # giving you many repeated pictureURLs (they will not get downloaded due to check @ dl_pic but checking will cont to grow in
-            # computate power reqs. So, figure out a way to make a true temp list based on the current call executed
+            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))
 
         else:
             class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan)
@@ -373,6 +371,8 @@ class CurateData:
             expanded_dropd = dropd
 
             expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
 
+            # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt
             temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))
 
         try:
@@ -387,29 +387,17 @@ class CurateData:
             json.dump(temp_pics_source_list, f)
 
         # Append to master training dataframes, drop potential dupes and save
 
         expanded_class.to_csv('expanded_class.csv')
-        # expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
-        # expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
 
         expanded_dropd.to_csv('expanded_dropd.csv')
-        # expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
-        # expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
 
         return expanded_class, expanded_dropd
 
-    def dl_pictures(self, *args):
+    def dl_pictures(self):
         '''
         Downloads pictures from api to local storage using temp_pics_source_list
         and creates custom {source:target} dictionary as dict_pics
         '''
 
-        # TODO add option to include only first image of each listing as
-        # others may be crappy for training. Also consider adding option to
-        # reduce the size of each pic downloaded
 
         try:
             with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
                 target_dir = json.load(f)
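The commented-out lines removed above were circling an append-and-dedupe step for the master CSVs. As a sketch of that pattern only (the helper name and the concat wiring are assumptions, not code from this repo):

    import os
    import pandas as pd

    def append_dedupe(new_rows, path):
        # merge freshly expanded rows into the master CSV, dropping repeated PictureURLs
        if os.path.exists(path):
            master = pd.read_csv(path, index_col=0)
            combined = pd.concat([master, new_rows], ignore_index=True)
        else:
            combined = new_rows
        combined = combined.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        combined.to_csv(path, encoding='utf-8')  # rewrite in full rather than appending with mode='a'
        return combined

    # e.g. append_dedupe(expanded_class, 'expanded_class.csv')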
@@ -428,45 +416,35 @@ class CurateData:
 
         with open('temp_pics_source_list.txt') as f:
             try:
-                if args:
-                    temp_pics_source_list = args[0]
-                else:
-                    temp_pics_source_list = json.load(f)
+                temp_pics_source_list = json.load(f)
             except (ValueError, FileNotFoundError):
-                if args:
-                    temp_pics_sources_list = args[0]
-                else:
-                    print('url list not found. download aborted')
-                    return
+                print('url list not found. download aborted')
+                return
 
-        temp_dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
-
-        try:
-            with open('dict_pics.txt') as f:
-                dict_pics = json.load(f)
-                dict_pics.update(temp_dict_pics) # TODO This still creates duplicates
-            with open('dict_pics.txt', 'w') as f:
-                json.dump(dict_pics, f)
-
-        except (ValueError, FileNotFoundError):
-            with open('dict_pics.txt', 'w') as f:
-                json.dump(temp_dict_pics, f)
-            dict_pics = temp_dict_pics
+        dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
+        with open('dict_pics.txt', 'w') as f:
+            json.dump(dict_pics, f)
 
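For reference, the comprehension above derives each local file name from the source URL with that regex. A hypothetical illustration (the URL and target_dir are made up; only the regex comes from the diff):

    import os
    import re

    target_dir = '/tmp/training_images'  # assumed; the real value is read from target_dirs.txt
    url = 'https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/abc123/$_57.JPG'  # hypothetical PictureURL

    name = re.search(r'[^/]+(?=/\$_|.jpg)', url, re.IGNORECASE).group()
    print(name)  # abc123
    print({url: target_dir + os.sep + name + '.jpg'})  # the {source: target} shape stored in dict_pics

Note the dot in .jpg is unescaped, so that branch of the lookahead matches any single character followed by jpg.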
         def dl_pic(dict_pics, pic):
 
-            if os.path.exists(dict_pics[pic]): # or call temp_dict_pics[pic] can work
-                pass # TODO This is not catching duplicates for some reason....possibly not? Upon inspection, files aren't duplicates...but why?
-                #TODO it would mean that temp_pics_source_list is changing for some reason?
-            else:
-                try:
-                    r = requests.get(pic, stream=True)
-                    r.raw.decode_content = True
-                    with open(temp_dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
-                        shutil.copyfileobj(r.raw, f)
-                except ConnectionError:
-                    return
+            try:
+                if os.path.exists(dict_pics[pic]):
+                    pass # TODO should catch dupes, but make sure it is
+
+                else:
+                    try:
+                        r = requests.get(pic, stream=True)
+                        r.raw.decode_content = True
+                        with open(dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
+                            shutil.copyfileobj(r.raw, f)
+                    except ConnectionError:
+                        return
+
+            except KeyError:
+                pass
 
         bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
         with concurrent.futures.ThreadPoolExecutor() as executor:
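The hunk stops at the executor line, so how bargs is consumed is not shown here. Purely as a generic illustration of driving a thread pool with (dict_pics, pic) tuples (the names and dummy data below are assumptions, not this repo's code):

    import concurrent.futures

    def dl_pic(dict_pics, pic):
        # stand-in for the downloader defined above
        print(pic, '->', dict_pics[pic])

    dict_pics = {'http://example.com/a.jpg': '/tmp/a.jpg'}  # hypothetical
    temp_pics_source_list = list(dict_pics)

    bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # unpack each (dict_pics, pic) tuple and run the downloads concurrently
        list(executor.map(lambda t: dl_pic(*t), bargs))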