changing custom dict creation to a dl_pictures function and making a temp_dict_pics.txt file along with target_dirs.txt

This commit is contained in:
spbeach46 2021-04-09 22:37:14 -07:00
parent ef237b8a1b
commit fb327a9375
2 changed files with 75 additions and 34 deletions

View File

@@ -15,7 +15,6 @@ expanded_dfs = curate.expand_nvlclass(class_training, dropd)
expanded_class = expanded_dfs[0]
expanded_dropd = expanded_dfs[1]
dict_pics = expanded_dfs[2]
# dict_pics = expanded_dfs[2]
# TODO # need to replace expanded df's PictureURL col values with destination urls
# TODO # still have the problem of duplicate listings. Possibly take care of this before you run curate
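# A minimal sketch of what the first TODO could look like once a source-to-destination
# mapping exists (hedged: dict_pics here mirrors the commented-out lines in
# expand_nvlclass below, not committed behavior):
# expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_class.loc[:, 'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])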

View File

@@ -1,11 +1,11 @@
import importlib
import os
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
import config as cfg
import wget # NOTE may not need this
import shutil
import re
@@ -125,7 +125,7 @@ class ShoppingApi:
try:
response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
response.raise_for_status()
except request.exceptions.RequestException:
except requests.exceptions.RequestException:
print('connection error')
response = response.json()
response = response['Item']
@@ -251,14 +251,14 @@ class CurateData:
dropd = nvl_training.drop(col_drop, axis=1)
return dropd
def combine_nvlclass(self, class_training, dropd):
final_training = pd.concat([class_training, dropd], axis=1)
return final_training # TODO might not need this function
def make_dict_pics(self, expanded_class_training):
with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up
dict_pics = json.load(jf)
dict_pics.extend('<')
# def combine_nvlclass(self, class_training, dropd):
# final_training = pd.concat([class_training, dropd], axis=1)
# return final_training # TODO might not need this function
#
# def make_dict_pics(self, expanded_class_training):
# with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up
# dict_pics = json.load(jf)
# dict_pics.extend('<')
def expand_nvlclass(self, class_training, dropd):
@@ -269,54 +269,96 @@ class CurateData:
'''
expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object
expanded_class = expanded_class.dropna(subset=['PictureURL'])
expanded_class = expanded_class.drop_duplicates(subset=['PictureURL'])
expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop duplicates here or before instantiating curate object
expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL'])
expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
dict_pics_list = list(set(expanded_class.PictureURL.to_list())) # probably need to create the set long before the df... immediately after the Shopping or Trading call
destination = 'your target folder' # decide whether to set a default folder, have the user define it as input every time, or have this only
# defined in the download function
dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|\.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list} # dot escaped so '.jpg' is matched literally
# with open('dict_pics.txt','w+') as f: # TODO open if it exists, or write if not, then extend the dictionary with dict_pics
# '''Will use temp_dict_pics for changing the training set at preprocessing'''
# temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
# # TODO decide if the above is necessary at this point or if it should
# # be created at preprocessing or download
#
# with open('dict_pics.txt', 'w') as f:
# try:
# dict_pics = json.load(f)
# dict_pics.update(temp_dict_pics)
# json.dump(dict_pics, f) # TODO This completely overwrites the old file. Fix to exclude corruptions
#
# except ValueError:
# json.dump(temp_dict_pics, f)
return expanded_class, expanded_dropd, dict_pics # TODO still need to replace source urls with destination urls in df cols and create custom dict {<source>: <destination>}
with open('dict_pics_list.txt', 'a+') as f: # Temp iterable for use w/executor
try:
f.seek(0) # 'a+' leaves the cursor at EOF, so rewind before reading
dict_pics_list.extend(json.load(f)) # merge URLs saved by an earlier run instead of appending the list to itself
except ValueError:
pass # file was empty or held invalid JSON
with open('dict_pics_list.txt', 'w') as f: # rewrite the whole list so the file stays valid JSON
json.dump(dict_pics_list, f)
return expanded_class, expanded_dropd
def dl_pictures(self):
'''
Downloads pictures from api to local storage using custom master dict
'''
with open('dict_pic.txt', 'w+') as jf: # avoid-duplicates logic goes here... I think
with open('target_dirs.txt', 'a+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
try:
f.seek(0) # 'a+' opens at EOF; rewind so json.load can read the saved directory
target_dir = json.load(f)
except ValueError:
target_dir = input('No default directory found. Create one? [y] or [n]: ') # TODO act on the y/n answer; target_dir currently holds the raw response
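# Hedged sketch of the optional-argument idea from the TODO above (signature is an
# assumption, not part of this commit):
# def dl_pictures(self, target_dir=None):
#     if target_dir is None:
#         target_dir = os.getcwd() # default when the caller doesn't pass a directory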
with open('dict_pics.txt') as jf:
dict_pics = json.load(jf)
r = requests.get('<dict_pic>', stream=True)
r.raw.decode_content = True
filename = '<your destination + naming scheme.jpg>'
with open(filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
with open('dict_pics_list.txt') as f:
dict_pics_list = json.load(f)
# NOTE consider adding this dl_pictures func inside another func that uses
# threading to run the dl_pictures func here somewhere
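# Hedged sketch of that wrapper idea (assumes 'import threading'; the name
# dl_pictures_async is hypothetical):
# def dl_pictures_async(self):
#     threading.Thread(target=self.dl_pictures, daemon=True).start()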
def dl_pic(pic):
# PictureURL values in the list can't be downloaded directly....have to use the indirect address in the form https://i.ebayimg.com/images/g/<unique code>/s-l<size>.jpg
# in place of https://i.ebayimg.com/00/s/ODQwWDE2MDA=/z/<unique code>/$_1.JPG, or use requests methods with the original PictureURL instead of wget? yes, use requests
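# Hedged sketch (assumption, not in this commit): if the <unique code> is shared
# between the two URL forms, the direct form could be derived like so:
# code = re.search(r'/(?:z|g)/([^/]+)/', pic).group(1)
# direct_url = 'https://i.ebayimg.com/images/g/' + code + '/s-l1600.jpg'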
if not os.path.exists(dict_pics[pic]): # skip pictures that were already downloaded
r = requests.get(pic, stream=True)
r.raw.decode_content = True # decompress gzip/deflate content before copying the raw stream
with open(dict_pics[pic], 'wb') as f: # might not work?
shutil.copyfileobj(r.raw, f)
with concurrent.futures.ThreadPoolExecutor() as executor:
for res in executor.map(dl_pic, dict_pics_list): # map yields results lazily, not futures
pass # consume the iterator so every download runs and worker exceptions surface
temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|\.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list} # built before the list is reset below; dot escaped to match '.jpg' literally
# TODO decide if the above is necessary at this point or if it should
# be created at preprocessing or download
try:
with open('dict_pics.txt') as f: # read first so 'w' mode doesn't truncate the old file before it is loaded
dict_pics = json.load(f)
dict_pics.update(temp_dict_pics)
except (FileNotFoundError, ValueError):
dict_pics = temp_dict_pics
with open('dict_pics.txt', 'w') as f:
json.dump(dict_pics, f)
with open('dict_pics_list.txt','w') as f: # reset the temp iterable now that its URLs are recorded in dict_pics
json.dump([], f)
# TODO pipeline gameplan: 5 files: master img download dict, raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
# cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
# if not exists and append to master img download dict
# --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted
# NOTE earlier stub superseded by dl_pictures above; download_function and master_url_dict were never defined
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     for future in executor.map(download_function, master_url_dict):
#         pass
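# Hedged sketch of the append-then-concat step from the gameplan above (file name
# from the TODO; new_class_df is a hypothetical df of freshly processed data):
# master = pd.read_csv('master_class_training.csv')
# master = pd.concat([master, new_class_df]).drop_duplicates().reset_index(drop=True)
# master.to_csv('master_class_training.csv', index=False)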
def update_df(self, data): # TODO save raw df as csv file
'''
Creates training instances for dataset. picture_url_list expanded to