From 5532a82d8dbf582cd96f045bcd7870b2e1d7359d Mon Sep 17 00:00:00 2001 From: spbeach46 Date: Thu, 15 Apr 2021 18:26:42 -0700 Subject: [PATCH] debugging dl_pictures --- ebay_api.py | 121 +++++++++++++++++++++++----------------------------- 1 file changed, 54 insertions(+), 67 deletions(-) diff --git a/ebay_api.py b/ebay_api.py index 0423a9d..2272dc5 100644 --- a/ebay_api.py +++ b/ebay_api.py @@ -125,10 +125,13 @@ class ShoppingApi: try: response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1) response.raise_for_status() + except requests.exceptions.RequestException: print('connection error') + response = response.json() response = response['Item'] + return response def conky(self): @@ -136,8 +139,8 @@ class ShoppingApi: For some reason item_id_results can only be passed as argument in executor.map if the variable is made within function ''' - data = [] # TODO I think you need to append a list of dictionaries rather than update a dictionary of dictionaries. Training var will require an updated dictionary though - finding = FindingApi(4, 2) + data = [] + finding = FindingApi(4, 2) # TODO replace these test values before production item_id_results = finding.get_ids_from_cats() with concurrent.futures.ThreadPoolExecutor() as executor: for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results): @@ -199,6 +202,9 @@ class CurateData: ''' interm_df1 = pd.Series(training.ItemSpecifics) interm_df1 = interm_df1.apply(lambda x: x['NameValueList']) + + # Necessary for json_normalize(): + nvl_dict = interm_df1.apply(lambda x: {k:v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])}) nvl_df = pd.json_normalize(nvl_dict) nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1) @@ -251,21 +257,11 @@ class CurateData: dropd = nvl_training.drop(col_drop, axis=1) return dropd -# def combine_nvlclass(self, class_training, dropd): -# final_training = pd.concat([class_training, dropd], axis=1) -# return final_training # TODO might not need this function -# -# def make_dict_pics(self, expanded_class_training): -# with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up -# dict_pics = json.load(jf) -# dict_pics.extend('<') - - def expand_nvlclass(self, class_training, dropd): ''' takes image url list from each cell and expands them into separate/duplicate instances. Modifies both class training and dropd dfs. Appends custom - image url dict {'source':'destination'}. + image url dict {'source':'target'}. ''' expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object expanded_class = expanded_class.dropna(subset=['PictureURL']) @@ -278,77 +274,53 @@ class CurateData: expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values - pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call - destination = 'your target folder' # decide whether or not you want to set a default folder to have the user define it as input every time. or have this only + temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call # defined in the download function -# '''Will use temp_dict_pics for changing the training set at preprocessing''' -# temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list} -# # TODO decide if the above is necesssary at this point or if it should -# # be created at preprocessing or download -# -# with open('dict_pics.txt', 'w') as f: -# try: -# dict_pics = json.load(f) -# dict_pics.update(temp_dict_pics) -# json.dump(dict_pics, f) # TODO This completely overwrites the old file. Fix to exclude corruptions -# -# except ValueError: -# json.dump(temp_dict_pics, f) - - with open('pics_source_list.txt', 'a+') as f: # Temp iterable for use w/executor + with open('temp_pics_source_list.txt', 'a+') as f: # Temp iterable for use w/executor try: - pics_source_list = json.load(f) - pics_source_list.append(pics_source_list) - json.dump(pics_source_list, f) + temp_pics_source_list = json.load(f) + temp_pics_source_list.append(temp_pics_source_list) + json.dump(temp_pics_source_list, f) # TODO This creates duplicates incorrectly except ValueError: - json.dump(pics_source_list, f) + json.dump(temp_pics_source_list, f) return expanded_class, expanded_dropd - def dl_pictures(self): + def dl_pictures(self, *args): ''' - Downloads pictures from api to local storage using pics_source_list + Downloads pictures from api to local storage using temp_pics_source_list and creates custom {source:target} dictionary as dict_pics ''' - with open('target_dirs.txt', 'a+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments + print('shitballs') + with open('target_dirs.txt', 'w+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments try: target_dir = json.load(f) except ValueError: target_dir = input('No target directory found. Create One? [y] or [n]:') - if target_dir == 'y': - target_dir = input('Please provide full URL to destination folder') + if target_dir == 'y' or 'Y': + target_dir = input('Please provide full URL to destination folder:') else: - print('Creating default folder in current directory') target_dir = os.getcwd() json.dump(target_dir, f) + print('Creating default folder in current directory, ' + target_dir) - with open('dict_pics.txt') as jf: - dict_pics = json.load(jf) + with open('temp_pics_source_list.txt') as f: + try: + if args: + temp_pics_source_list = args + else: + temp_pics_source_list = json.load(f) + except ValueError: + if args: + temp_pics_sources_list = args + else: + print('url list not found. download aborted') + return - with open('pics_source_list.txt') as f: - pics_source_list = json.load(f) - - def dl_pic(pic): - - if os.path.exists(dict_pics[pic]): - pass - - else: - r = requests.get(pic, stream=True) - r.raw.decode_content = True - with open(dict_pics[pic], 'wb') as f: # might not work? - shutil.copyfileobj(r.raw, f) - - with concurrent.futures.ThreadPoolExecutor() as executor: - for future in executor.map(dl_pic, pics_source_list): - future - - temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list} - # TODO decide if the above is necesssary at this point or if it should - # be created at preprocessing or download + temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in temp_pics_source_list} with open('dict_pics.txt', 'w') as f: try: @@ -359,9 +331,24 @@ class CurateData: except ValueError: json.dump(temp_dict_pics, f) - with open('pics_source_list.txt','w') as f: - pics_source_list = [] - json.dump(pics_source_list, f) + def dl_pic(pic,dict_pics): + + if os.path.exists(dict_pics[pic]): # or call temp_dict_pics[pic] can work + pass + + else: + r = requests.get(pic, stream=True) + r.raw.decode_content = True + with open(temp_dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work + shutil.copyfileobj(r.raw, f) + + with concurrent.futures.ThreadPoolExecutor() as executor: + for future in executor.map(dl_pic, temp_pics_source_list): + future + + with open('temp_pics_source_list.txt','w') as f: # Overwrites old when complete + temp_pics_source_list = [] + json.dump(temp_pics_source_list, f) # TODO pipeline gameplan: 5 files: master img download dict,raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures @@ -389,6 +376,6 @@ if __name__ == "__main__": # TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO # TO AVOID HICCUPS WHEN CREATING DATASET -# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags. +# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF Shoe TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags. # Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)