diff --git a/curate.py b/curate.py
index 45cbd04..e505cff 100644
--- a/curate.py
+++ b/curate.py
@@ -12,7 +12,9 @@
 class_training = curate.class_training(training) # creates initial class_trainin
 nvl_training = curate.nvl_training(training) # creates initial nvl_training
 dropd = curate.drop_nvl_cols(nvl_training) # label mask dropd
-expanded_dfs = curate.expand_nvlclass(class_training, dropd) # pulls values out of lists for both dfs
+
+# pulls values out of lists for both dfs and creates temp_pics_source_list.txt
+expanded_dfs = curate.expand_nvlclass(class_training, dropd)
 expanded_class = expanded_dfs[0] # TODO still having problems with Unnamed: 0 col
 expanded_dropd = expanded_dfs[1] # TODO incorrect df. Look at nvl_training func. Specifically "reindex" usage

@@ -20,7 +22,7 @@
 download = input('download images?: ')
 if ('y' or 'Y') in download:
     with open('temp_pics_source_list.txt') as f:
-        test_list = json.load(f)
-        curate.dl_pictures(test_list)
+        url_list = json.load(f)
+        curate.dl_pictures(url_list)
 else:
     pass
diff --git a/ebay_api.py b/ebay_api.py
index 1cdaa19..da47a63 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -168,7 +168,7 @@ class ShoppingApi:
     def get_item_from_findItemsByCategory(self, twenty_id):

         '''
-        Gets raw JSON data from multiple live listings given multiple itemIds
+        Gets raw JSON data from multiple live listings given multiple itemIds
         '''

         headers = {
@@ -212,10 +212,11 @@ class ShoppingApi:
                 for item in future:
                     data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
             else:
-                print('reached call limit')
+                print('response is None')
                 break
         with open('raw_data.txt', 'w') as f:
             json.dump(data, f)
+        return data


 # NOTE:
@@ -359,10 +360,7 @@ class CurateData:
             expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

-            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # TODO because var is del after dl_pictures you may be
-            # getting duplicate pictures. ie, expanded_class.PictureURL is a master series and will write temp_pics_source_list as such
-            # giving you many repeated pictureURLs (they will not get downloaded due to check @ dl_pic but checking will cont to grow in
-            # computate power reqs. So, figure out a way to make a true temp list based on the current call executed
+            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

         else:
             class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan)

@@ -373,6 +371,8 @@ class CurateData:

             expanded_dropd = dropd
             expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
+
+            # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt
             temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

         try:
@@ -387,29 +387,17 @@ class CurateData:
             json.dump(temp_pics_source_list, f)

         # Append to master training dataframes, drop potential dupes and save
-        expanded_class.to_csv('expanded_class.csv')
-        # expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
-        # expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
-        expanded_dropd.to_csv('expanded_dropd.csv')
-        # expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
-        # expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')

         return expanded_class, expanded_dropd

-    def dl_pictures(self, *args):
+    def dl_pictures(self):

         '''
         Downloads pictures from api to local storage using temp_pics_source_list
         and creates custom {source:target} dictionary as dict_pics
         '''

-        # TODO add option to include only first image of each listing as
-        # others may be crappy for training. Also consider adding option to
-        # reduce the size of each pic downloaded
-

         try:
             with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
                 target_dir = json.load(f)
@@ -428,45 +416,35 @@ class CurateData:
         with open('temp_pics_source_list.txt') as f:
             try:
-                if args:
-                    temp_pics_source_list = args[0]
-                else:
-                    temp_pics_source_list = json.load(f)
+                temp_pics_source_list = json.load(f)
             except (ValueError, FileNotFoundError):
-                if args:
-                    temp_pics_sources_list = args[0]
-                else:
-                    print('url list not found. download aborted')
-                    return
+                print('url list not found. download aborted')
+                return

-        temp_dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
-
-        try:
-            with open('dict_pics.txt') as f:
-                dict_pics = json.load(f)
-                dict_pics.update(temp_dict_pics) # TODO This still creates duplicates
-            with open('dict_pics.txt', 'w') as f:
-                json.dump(dict_pics, f)
-
-        except (ValueError, FileNotFoundError):
-            with open('dict_pics.txt', 'w') as f:
-                json.dump(temp_dict_pics, f)
-                dict_pics = temp_dict_pics
+
+        dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
+        with open('dict_pics.txt', 'w') as f:
+            json.dump(dict_pics, f)

         def dl_pic(dict_pics, pic):
-            if os.path.exists(dict_pics[pic]): # or call temp_dict_pics[pic] can work
-                pass # TODO This is not catching duplicates for some reason....possibly not? Upon inspection, files aren't duplicates...but why?
-                #TODO it would mean that temp_pics_source_list is changing for some reason?
+            try:
-            else:
-                try:
-                    r = requests.get(pic, stream=True)
-                    r.raw.decode_content = True
-                    with open(temp_dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
-                        shutil.copyfileobj(r.raw, f)
-                except ConnectionError:
-                    return
+                if os.path.exists(dict_pics[pic]):
+                    pass # TODO should catch dupes, but make sure it is
+
+                else:
+                    try:
+                        r = requests.get(pic, stream=True)
+                        r.raw.decode_content = True
+                        with open(dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
+                            shutil.copyfileobj(r.raw, f)
+                    except ConnectionError:
+
+                        return
+
+            except KeyError:
+                pass

         bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
         with concurrent.futures.ThreadPoolExecutor() as executor:
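
Two things stand out in the curate.py hunk. First, ('y' or 'Y') evaluates
to just 'y', so answering with an uppercase Y never enters the download
branch. Second, dl_pictures is changed below to take no argument besides
self, yet the driver still passes url_list, which would raise a TypeError
at runtime. A minimal sketch of the intended prompt block, assuming
dl_pictures() is now meant to read temp_pics_source_list.txt on its own:

    # Sketch only: normalize the answer rather than testing ('y' or 'Y'),
    # and call dl_pictures() with no argument to match its new signature.
    download = input('download images?: ')
    if download.strip().lower().startswith('y'):
        curate.dl_pictures()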
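
The long TODO removed from expand_nvlclass describes a real inefficiency:
temp_pics_source_list is rebuilt from the master expanded_class.PictureURL
series on every call, so dl_pic's os.path.exists check re-scans an
ever-growing URL list even when nothing new needs downloading. One way to
get the "true temp list based on the current call" the comment asks for is
to persist the set of URLs already queued and write only the unseen ones.
A sketch of that idea; pics_download_record.txt is a hypothetical file
name, not one the repo currently uses:

    import json

    def write_temp_source_list(picture_urls, record_file='pics_download_record.txt'):
        '''Write only never-before-seen URLs to temp_pics_source_list.txt
        and fold them into a persistent record for the next call.'''
        try:
            with open(record_file) as f:
                seen = set(json.load(f))
        except (ValueError, FileNotFoundError):
            seen = set()  # first run, or unreadable record: treat all URLs as new
        new_urls = [url for url in set(picture_urls) if url not in seen]
        with open('temp_pics_source_list.txt', 'w') as f:
            json.dump(new_urls, f)
        with open(record_file, 'w') as f:
            json.dump(sorted(seen | set(new_urls)), f)
        return new_urls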
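
In the rewritten dl_pictures, the dict_pics comprehension calls .group() on
the result of re.search, which returns None for any URL the pattern misses,
so a single odd URL raises AttributeError before dl_pic's KeyError guard
ever runs; the pattern also leaves the dot in .jpg unescaped, so it matches
any character followed by jpg. A defensive sketch of that step, assuming
URLs the pattern cannot name should simply be skipped:

    import os
    import re

    def build_dict_pics(temp_pics_source_list, target_dir):
        '''Map source URL -> local target path, skipping URLs the filename
        pattern cannot handle instead of crashing on them.'''
        dict_pics = {}
        for url in temp_pics_source_list:
            match = re.search(r'[^/]+(?=/\$_|\.jpg)', url, re.IGNORECASE)
            if match:  # skip unmatched URLs rather than calling .group() on None
                dict_pics[url] = target_dir + os.sep + match.group() + '.jpg'
        return dict_pics

Relatedly, unless ebay_api.py imports ConnectionError from
requests.exceptions, the bare "except ConnectionError:" in dl_pic names the
builtin, which requests' connection failures do not inherit from;
requests.exceptions.ConnectionError (or the broader RequestException) is
the safer thing to catch.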