dl_pictures and curate.py cleanup

scott 2022-01-07 02:47:22 -07:00
parent e879e80910
commit ae25ab88b6
2 changed files with 35 additions and 55 deletions

View File

@ -12,7 +12,9 @@ class_training = curate.class_training(training) # creates initial class_trainin
 nvl_training = curate.nvl_training(training) # creates initial nvl_training
 dropd = curate.drop_nvl_cols(nvl_training) # label mask
 dropd
-expanded_dfs = curate.expand_nvlclass(class_training, dropd) # pulls values out of lists for both dfs
+# pulls values out of lists for both dfs and creates temp_pics_source_list.txt
+expanded_dfs = curate.expand_nvlclass(class_training, dropd)
 expanded_class = expanded_dfs[0] # TODO still having problems with Unnamed: 0 col
 expanded_dropd = expanded_dfs[1] # TODO incorrect df. Look at nvl_training func. Specifically "reindex" usage
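
The new standalone comment documents that expand_nvlclass both flattens the list-valued columns and writes temp_pics_source_list.txt. The function itself is not shown in this hunk; below is a minimal sketch of the list-flattening half only, assuming pandas DataFrames whose cells hold lists. The helper name and the explode-based approach are illustrative, not the repo's actual implementation.

import pandas as pd

def expand_list_columns(df, columns):
    """Flatten list-valued columns so each list element gets its own row (sketch)."""
    for col in columns:
        df = df.explode(col, ignore_index=True)
    return df

# Example: one listing with three PictureURL entries becomes three rows.
demo = pd.DataFrame({'ItemID': [1], 'PictureURL': [['a.jpg', 'b.jpg', 'c.jpg']]})
print(expand_list_columns(demo, ['PictureURL']))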
@ -20,7 +22,7 @@ expanded_dropd = expanded_dfs[1] # TODO incorrect df. Look at nvl_training func.
 download = input('download images?: ')
 if ('y' or 'Y') in download:
     with open('temp_pics_source_list.txt') as f:
-        test_list = json.load(f)
-    curate.dl_pictures(test_list)
+        url_list = json.load(f)
+    curate.dl_pictures(url_list)
 else:
     pass
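
Renaming test_list to url_list makes the intent clearer. Note that ('y' or 'Y') evaluates to just 'y', so only answers containing a lowercase y trigger the download. A self-contained sketch of the prompt with a case-insensitive check instead; dl_pictures here is a stand-in for curate.dl_pictures, and the .lower() check is a suggestion rather than part of this commit.

import json

def run_download_prompt(dl_pictures):
    """dl_pictures stands in for curate.dl_pictures from the script above."""
    answer = input('download images?: ')
    if answer.strip().lower().startswith('y'):   # case-insensitive, unlike ('y' or 'Y')
        with open('temp_pics_source_list.txt') as f:
            url_list = json.load(f)              # list of picture URLs written by expand_nvlclass
        dl_pictures(url_list)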

View File

@ -168,7 +168,7 @@ class ShoppingApi:
     def get_item_from_findItemsByCategory(self, twenty_id):
         '''
-        Gets raw JSON data from multiple live listings given multiple itemIds
+        Gets raw JSON data from multiple live listings given multiple itemIds
         '''
         headers = {
@ -212,10 +212,11 @@ class ShoppingApi:
                 for item in future:
                     data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
             else:
-                print('reached call limit')
+                print('response is None')
                 break
         with open('raw_data.txt', 'w') as f:
             json.dump(data, f)
         return data
 # NOTE:
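
The reworded message reflects what the else branch actually detects: a None response from a worker, not a hit on the API call limit. The enclosing method is only partly visible here; below is a minimal standalone sketch of the gather-then-dump pattern it appears to follow. The function name and the responses argument are assumed for illustration.

import json

def collect_listings(responses):
    """Flatten per-call listing dicts into one list and persist it (sketch only;
    the real method also drives the eBay Shopping API with a thread pool)."""
    data = []
    for future in responses:           # each entry is one call's list of listing dicts
        if future is not None:
            for item in future:
                data.append(item)      # end result: a flat list of listing dicts
        else:
            print('response is None')  # matches the message added in this commit
            break
    with open('raw_data.txt', 'w') as f:
        json.dump(data, f)
    return data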
@ -359,10 +360,7 @@ class CurateData:
             expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
-            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # TODO because var is del after dl_pictures you may be
-            # getting duplicate pictures. ie, expanded_class.PictureURL is a master series and will write temp_pics_source_list as such
-            # giving you many repeated pictureURLs (they will not get downloaded due to check @ dl_pic but checking will cont to grow in
-            # computate power reqs. So, figure out a way to make a true temp list based on the current call executed
+            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))
         else:
             class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan)
@ -373,6 +371,8 @@ class CurateData:
             expanded_dropd = dropd
         expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
+        # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt
+        temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))
         try:
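
The deleted TODO block above warned that building temp_pics_source_list from the master expanded_class.PictureURL series re-lists every URL ever seen, and the new comment notes the file is simply rewritten on each call. One possible way to keep the temp list limited to genuinely new URLs is sketched below; the helper is hypothetical and not part of this commit.

import json

def write_temp_source_list(picture_urls, path='temp_pics_source_list.txt'):
    """Hypothetical helper: keep only URLs not already queued in earlier calls,
    so the temp list stays small instead of mirroring the master PictureURL series."""
    try:
        with open(path) as f:
            already_listed = set(json.load(f))
    except (ValueError, FileNotFoundError):
        already_listed = set()
    fresh = sorted(set(picture_urls) - already_listed)
    with open(path, 'w') as f:
        json.dump(fresh, f)
    return fresh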
@ -387,29 +387,17 @@ class CurateData:
             json.dump(temp_pics_source_list, f)
         # Append to master training dataframes, drop potential dupes and save
         expanded_class.to_csv('expanded_class.csv')
-        # expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
-        # expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
         expanded_dropd.to_csv('expanded_dropd.csv')
-        # expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
-        # expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
         return expanded_class, expanded_dropd

-    def dl_pictures(self, *args):
+    def dl_pictures(self):
         '''
         Downloads pictures from api to local storage using temp_pics_source_list
         and creates custom {source:target} dictionary as dict_pics
         '''
-        # TODO add option to include only first image of each listing as
-        # others may be crappy for training. Also consider adding option to
-        # reduce the size of each pic downloaded
         try:
             with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
                 target_dir = json.load(f)
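
The commented-out read_csv / drop_duplicates lines are gone, but the remaining comment still describes the goal: append to the master training dataframes, drop potential dupes, and save. A hedged sketch of that flow with pandas, using the CSV names from the diff; the helper itself is hypothetical.

import pandas as pd

def append_to_master(new_df, path, dedupe_on='PictureURL'):
    """Hypothetical helper: append a freshly expanded dataframe to its master CSV,
    dropping rows whose PictureURL was already recorded."""
    try:
        master = pd.read_csv(path, index_col=0)
        combined = pd.concat([master, new_df], ignore_index=True)
    except FileNotFoundError:
        combined = new_df
    combined = combined.drop_duplicates(subset=[dedupe_on]).reset_index(drop=True)
    combined.to_csv(path)
    return combined

# e.g. append_to_master(expanded_class, 'expanded_class.csv')
#      append_to_master(expanded_dropd, 'expanded_dropd.csv')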
@ -428,45 +416,35 @@ class CurateData:
         with open('temp_pics_source_list.txt') as f:
             try:
-                if args:
-                    temp_pics_source_list = args[0]
-                else:
-                    temp_pics_source_list = json.load(f)
+                temp_pics_source_list = json.load(f)
             except (ValueError, FileNotFoundError):
-                if args:
-                    temp_pics_sources_list = args[0]
-                else:
-                    print('url list not found. download aborted')
-                    return
+                print('url list not found. download aborted')
+                return
-        temp_dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
-        try:
-            with open('dict_pics.txt') as f:
-                dict_pics = json.load(f)
-                dict_pics.update(temp_dict_pics) # TODO This still creates duplicates
-            with open('dict_pics.txt', 'w') as f:
-                json.dump(dict_pics, f)
-        except (ValueError, FileNotFoundError):
-            with open('dict_pics.txt', 'w') as f:
-                json.dump(temp_dict_pics, f)
-            dict_pics = temp_dict_pics
+        dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
+        with open('dict_pics.txt', 'w') as f:
+            json.dump(dict_pics, f)
         def dl_pic(dict_pics, pic):
-            if os.path.exists(dict_pics[pic]): # or call temp_dict_pics[pic] can work
-                pass # TODO This is not catching duplicates for some reason....possibly not? Upon inspection, files aren't duplicates...but why?
-                #TODO it would mean that temp_pics_source_list is changing for some reason?
+            try:
-            else:
-                try:
-                    r = requests.get(pic, stream=True)
-                    r.raw.decode_content = True
-                    with open(temp_dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
-                        shutil.copyfileobj(r.raw, f)
-                except ConnectionError:
-                    return
+                if os.path.exists(dict_pics[pic]):
+                    pass # TODO should catch dupes, but make sure it is
+                else:
+                    try:
+                        r = requests.get(pic, stream=True)
+                        r.raw.decode_content = True
+                        with open(dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
+                            shutil.copyfileobj(r.raw, f)
+                    except ConnectionError:
+                        return
+            except KeyError:
+                pass
         bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
         with concurrent.futures.ThreadPoolExecutor() as executor:
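
Taken together, the simplified dl_pictures now always reads temp_pics_source_list.txt, builds a single {source URL: local path} mapping with the filename-extracting regex, rewrites dict_pics.txt, and fans the downloads out over a thread pool, skipping files that already exist. Below is a minimal standalone sketch of that flow under the same assumptions; target_dir handling and the executor call are simplified, since the rest of the method is cut off in this view.

import concurrent.futures
import os
import re
import shutil
import requests

def build_dict_pics(urls, target_dir):
    """Map each source URL to a local target path, using the same
    filename-extracting regex as dl_pictures above."""
    return {u: target_dir + os.sep
               + re.search(r'[^/]+(?=/\$_|.jpg)', u, re.IGNORECASE).group() + '.jpg'
            for u in urls}

def dl_pic(dict_pics, pic):
    """Download one picture unless its target file already exists (sketch)."""
    if os.path.exists(dict_pics[pic]):
        return                                   # skip files already on disk
    try:
        r = requests.get(pic, stream=True)
        r.raw.decode_content = True
        with open(dict_pics[pic], 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    except requests.exceptions.ConnectionError:
        return

def dl_pictures(urls, target_dir):
    dict_pics = build_dict_pics(urls, target_dir)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(lambda pic: dl_pic(dict_pics, pic), urls)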