dl_pictures and curate.py cleanup
parent e879e80910
commit ae25ab88b6
@@ -12,7 +12,9 @@ class_training = curate.class_training(training) # creates initial class_training
 nvl_training = curate.nvl_training(training) # creates initial nvl_training
 dropd = curate.drop_nvl_cols(nvl_training) # label mask
 dropd
-expanded_dfs = curate.expand_nvlclass(class_training, dropd) # pulls values out of lists for both dfs
+# pulls values out of lists for both dfs and creates temp_pics_source_list.txt
+expanded_dfs = curate.expand_nvlclass(class_training, dropd)
 
 expanded_class = expanded_dfs[0] # TODO still having problems with Unnamed: 0 col
 expanded_dropd = expanded_dfs[1] # TODO incorrect df. Look at nvl_training func. Specifically "reindex" usage
@@ -20,7 +22,7 @@ expanded_dropd = expanded_dfs[1] # TODO incorrect df. Look at nvl_training func.
 download = input('download images?: ')
 if ('y' or 'Y') in download:
     with open('temp_pics_source_list.txt') as f:
-        test_list = json.load(f)
-    curate.dl_pictures(test_list)
+        url_list = json.load(f)
+    curate.dl_pictures(url_list)
 else:
     pass
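One thing this hunk keeps as-is: ('y' or 'Y') evaluates to plain 'y', so the branch only fires when a lowercase y appears in the input. If a stricter prompt check is ever wanted, a minimal sketch (not part of this commit, assuming the same json/curate imports as the script above):

    download = input('download images?: ')
    if download.strip().lower().startswith('y'):  # accepts y, Y, yes, Yes, ...
        with open('temp_pics_source_list.txt') as f:
            url_list = json.load(f)
        curate.dl_pictures(url_list)
    else:
        pass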
82 ebay_api.py
@@ -168,7 +168,7 @@ class ShoppingApi:
     def get_item_from_findItemsByCategory(self, twenty_id):
 
         '''
-        Gets raw JSON data from multiple live listings given multiple itemIds
+        Gets raw JSON data from multiple live listings given multiple itemIds
         '''
 
         headers = {
@@ -212,10 +212,11 @@ class ShoppingApi:
                 for item in future:
                     data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
             else:
-                print('reached call limit')
+                print('response is None')
                 break
         with open('raw_data.txt', 'w') as f:
             json.dump(data, f)
         return data
 
     # NOTE:
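data ends up in raw_data.txt as one JSON array of listing dicts, so the curation side can reload it in a single call; a minimal sketch (only the file name comes from this diff, the rest is assumed):

    import json

    with open('raw_data.txt') as f:
        raw_data = json.load(f)  # list of dicts, one per listing
    print(len(raw_data), 'listings')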
@@ -359,10 +360,7 @@ class CurateData:
 
             expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
 
-            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # TODO because var is del after dl_pictures you may be
-            # getting duplicate pictures. ie, expanded_class.PictureURL is a master series and will write temp_pics_source_list as such
-            # giving you many repeated pictureURLs (they will not get downloaded due to check @ dl_pic but checking will cont to grow in
-            # computate power reqs. So, figure out a way to make a true temp list based on the current call executed
+            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))
 
         else:
             class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan)
@@ -373,6 +371,8 @@ class CurateData:
             expanded_dropd = dropd
 
             expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
 
+            # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt
             temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))
 
         try:
@@ -387,29 +387,17 @@ class CurateData:
             json.dump(temp_pics_source_list, f)
 
         # Append to master training dataframes, drop potential dupes and save
 
         expanded_class.to_csv('expanded_class.csv')
-        # expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
-        # expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
 
         expanded_dropd.to_csv('expanded_dropd.csv')
-        # expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
-        # expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
 
         return expanded_class, expanded_dropd
 
-    def dl_pictures(self, *args):
+    def dl_pictures(self):
         '''
         Downloads pictures from api to local storage using temp_pics_source_list
         and creates custom {source:target} dictionary as dict_pics
         '''
 
-        # TODO add option to include only first image of each listing as
-        # others may be crappy for training. Also consider adding option to
-        # reduce the size of each pic downloaded
 
         try:
             with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
                 target_dir = json.load(f)
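The commented-out lines removed above were circling an append-and-dedupe step for the master CSVs. As a sketch of that pattern only (the helper name and the concat wiring are assumptions, not code from this repo):

    import os
    import pandas as pd

    def append_dedupe(new_rows, path):
        # merge freshly expanded rows into the master CSV, dropping repeated PictureURLs
        if os.path.exists(path):
            master = pd.read_csv(path, index_col=0)
            combined = pd.concat([master, new_rows], ignore_index=True)
        else:
            combined = new_rows
        combined = combined.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        combined.to_csv(path, encoding='utf-8')  # rewrite in full rather than appending with mode='a'
        return combined

    # e.g. append_dedupe(expanded_class, 'expanded_class.csv')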
@@ -428,45 +416,35 @@ class CurateData:
 
         with open('temp_pics_source_list.txt') as f:
             try:
-                if args:
-                    temp_pics_source_list = args[0]
-                else:
-                    temp_pics_source_list = json.load(f)
+                temp_pics_source_list = json.load(f)
             except (ValueError, FileNotFoundError):
-                if args:
-                    temp_pics_sources_list = args[0]
-                else:
-                    print('url list not found. download aborted')
-                    return
+                print('url list not found. download aborted')
+                return
 
-        temp_dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
-
-        try:
-            with open('dict_pics.txt') as f:
-                dict_pics = json.load(f)
-                dict_pics.update(temp_dict_pics) # TODO This still creates duplicates
-            with open('dict_pics.txt', 'w') as f:
-                json.dump(dict_pics, f)
-
-        except (ValueError, FileNotFoundError):
-            with open('dict_pics.txt', 'w') as f:
-                json.dump(temp_dict_pics, f)
-            dict_pics = temp_dict_pics
+        dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
+        with open('dict_pics.txt', 'w') as f:
+            json.dump(dict_pics, f)
 
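For reference, the comprehension above derives each local file name from the source URL with that regex. A hypothetical illustration (the URL and target_dir are made up; only the regex comes from the diff):

    import os
    import re

    target_dir = '/tmp/training_images'  # assumed; the real value is read from target_dirs.txt
    url = 'https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/abc123/$_57.JPG'  # hypothetical PictureURL

    name = re.search(r'[^/]+(?=/\$_|.jpg)', url, re.IGNORECASE).group()
    print(name)  # abc123
    print({url: target_dir + os.sep + name + '.jpg'})  # the {source: target} shape stored in dict_pics

Note the dot in .jpg is unescaped, so that branch of the lookahead matches any single character followed by jpg.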
         def dl_pic(dict_pics, pic):
 
-            if os.path.exists(dict_pics[pic]): # or call temp_dict_pics[pic] can work
-                pass # TODO This is not catching duplicates for some reason....possibly not? Upon inspection, files aren't duplicates...but why?
-                #TODO it would mean that temp_pics_source_list is changing for some reason?
-            else:
-                try:
-                    r = requests.get(pic, stream=True)
-                    r.raw.decode_content = True
-                    with open(temp_dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
-                        shutil.copyfileobj(r.raw, f)
-                except ConnectionError:
-                    return
+            try:
+                if os.path.exists(dict_pics[pic]):
+                    pass # TODO should catch dupes, but make sure it is
+
+                else:
+                    try:
+                        r = requests.get(pic, stream=True)
+                        r.raw.decode_content = True
+                        with open(dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
+                            shutil.copyfileobj(r.raw, f)
+                    except ConnectionError:
+                        return
+
+            except KeyError:
+                pass
 
         bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
         with concurrent.futures.ThreadPoolExecutor() as executor:
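The hunk stops at the executor line, so how bargs is consumed is not shown here. Purely as a generic illustration of driving a thread pool with (dict_pics, pic) tuples (the names and dummy data below are assumptions, not this repo's code):

    import concurrent.futures

    def dl_pic(dict_pics, pic):
        # stand-in for the downloader defined above
        print(pic, '->', dict_pics[pic])

    dict_pics = {'http://example.com/a.jpg': '/tmp/a.jpg'}  # hypothetical
    temp_pics_source_list = list(dict_pics)

    bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # unpack each (dict_pics, pic) tuple and run the downloads concurrently
        list(executor.map(lambda t: dl_pic(*t), bargs))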