mostly finished dl_pictures(). Need testing

This commit is contained in:
spbeach46 2021-04-13 10:10:24 -07:00
parent fb327a9375
commit 34a6451400


@@ -278,12 +278,12 @@ class CurateData:
expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
dict_pics_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
destination = 'your target folder' # decide whether to set a default folder, have the user define it as input every time, or have this only
# defined in the download function
# '''Will use temp_dict_pics for changing the training set at preprocessing'''
# temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
# temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list}
# # TODO decide if the above is necessary at this point or if it should
# # be created at preprocessing or download
#
@@ -296,37 +296,46 @@ class CurateData:
# except ValueError:
# json.dump(temp_dict_pics, f)
with open('dict_pics_list.txt', 'a+') as f: # Temp iterable for use w/executor
with open('pics_source_list.txt', 'a+') as f: # Temp iterable for use w/executor
try:
dict_pics_list = json.load(f)
dict_pics_list.append(dict_pics_list)
json.dump(dict_pics_list, f)
pics_source_list = json.load(f)
pics_source_list.append(pics_source_list)
json.dump(pics_source_list, f)
except ValueError:
json.dump(dict_pics_list, f)
json.dump(pics_source_list, f)
return expanded_class, expanded_dropd
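A note on the persistence block above: with mode 'a+' the file pointer starts at end-of-file, so json.load() raises ValueError even when saved data exists, and appending the loaded list to itself never merges in the freshly scraped URLs. A minimal sketch of the read-merge-write pattern this appears to be aiming for (save_pics_source_list is a hypothetical helper name; only the file name and pics_source_list come from the code above):

# Sketch only: merge newly scraped URLs into the saved list and rewrite the file.
# save_pics_source_list is a hypothetical helper; the merge behavior is an assumption.
import json
import os

def save_pics_source_list(pics_source_list, path='pics_source_list.txt'):
    saved = []
    if os.path.exists(path):
        with open(path) as f:
            try:
                saved = json.load(f)
            except ValueError:  # empty or partially written file
                saved = []
    merged = list(set(saved) | set(pics_source_list))  # drop duplicate URLs
    with open(path, 'w') as f:  # overwrite instead of appending a second JSON document
        json.dump(merged, f)
    return merged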
def dl_pictures(self):
'''
Downloads pictures from api to local storage using custom master dict
Downloads pictures from api to local storage using pics_source_list
and creates custom {source:target} dictionary as dict_pics
'''
with open('target_dirs.txt', 'a+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
try:
target_dir = json.load(f)
except ValueError:
target_dir = input('No default directory found. Create One? [y] or [n]:')
target_dir = input('No target directory found. Create One? [y] or [n]:')
if target_dir == 'y':
target_dir = input('Please provide full URL to destination folder')
else:
print('Creating default folder in current directory')
target_dir = os.getcwd()
json.dump(target_dir, f)
with open('dict_pics.txt') as jf:
dict_pics = json.load(jf)
with open('dict_pics_list.txt') as f:
dict_pics_list = json.load(f)
with open('pics_source_list.txt') as f:
pics_source_list = json.load(f)
def dl_pic(pic):
if os.path.exists(dict_pics[pic]):
pass
else:
r = requests.get(pic, stream=True)
r.raw.decode_content = True
@@ -334,14 +343,10 @@ class CurateData:
shutil.copyfileobj(r.raw, f)
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(dl_pic, dict_pics_list):
for future in executor.map(dl_pic, pics_source_list):
future
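For testing dl_pictures() it may help to see the download worker and thread-pool driver as one self-contained sketch; dict_pics (source URL -> local path) and the streaming requests/shutil calls mirror the code above, while dl_all and the exception-surfacing via list() are illustrative additions:

# Sketch only: one worker per picture plus the thread-pool driver.
# dl_all is a hypothetical wrapper; dict_pics maps source URL -> local file path.
import concurrent.futures
import os
import shutil
import requests

def dl_pic(pic, dict_pics):
    target = dict_pics[pic]
    if os.path.exists(target):  # skip pictures that were already downloaded
        return
    r = requests.get(pic, stream=True)
    r.raw.decode_content = True  # decompress gzip/deflate before writing raw bytes
    with open(target, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

def dl_all(pics_source_list, dict_pics):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # list() drains the generator so worker exceptions surface here
        list(executor.map(lambda pic: dl_pic(pic, dict_pics), pics_source_list))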
with open('dict_pics_list.txt','w') as f:
dict_pics_list = []
json.dump(dict_pics_list, f)
temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list}
# TODO decide if the above is necessary at this point or if it should
# be created at preprocessing or download
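The comprehension above derives each local filename from its source URL with that regex. A standalone sketch using os.path.join instead of plain string concatenation (build_dict_pics and the sample URL are made up for illustration):

# Sketch only: build the {source URL: local path} mapping used as dict_pics.
# build_dict_pics is hypothetical and the example URL is invented.
import os
import re

def build_dict_pics(pics_source_list, target_dir):
    dict_pics = {}
    for url in pics_source_list:
        # same pattern as above: grab the path segment before '/$_' (or before '<char>jpg')
        name = re.search(r'[^/]+(?=/\$_|.jpg)', url, re.IGNORECASE).group()
        dict_pics[url] = os.path.join(target_dir, name + '.jpg')
    return dict_pics

build_dict_pics(['http://i.ebayimg.com/00/s/ODAwWDgwMA/z/abc123xyz/$_57.JPG'], '/tmp/pics')
# -> {'http://i.ebayimg.com/00/s/ODAwWDgwMA/z/abc123xyz/$_57.JPG': '/tmp/pics/abc123xyz.jpg'}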
@@ -354,25 +359,15 @@ class CurateData:
except ValueError:
json.dump(temp_dict_pics, f)
with open('pics_source_list.txt','w') as f:
pics_source_list = []
json.dump(pics_source_list, f)
# TODO pipeline gameplan: 5 files: master img download dict,raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
# cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
# if not exists and append to master img download dict
# --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted
def update_df(self, data): # TODO save raw df as csv file
'''
Creates training instances for the dataset. picture_url_list is expanded to
the max available pictures, with each picture URL sharing the features of
its listing (i.e., because there are multiple pictures per listing, each
picture will be its own training instance).
'''
pass
# TODO You will have to mess around more with pandas df to find a better solution to creating your csv file: i.e., create dataframe from instances, run through process to customize your df
# for final training set for your ml model training. Contemplate on the future... you want ability to update main csv AND training csv; one for updating raw data instances from search queries, and
# the other for updating your training set.
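update_df() is still a stub; one hedged sketch of the per-picture expansion its docstring describes, assuming the raw listings arrive as a DataFrame whose PictureURL column holds lists (expand_listings and the sample data are illustrative):

# Sketch only: expand each listing row into one row per picture URL.
import pandas as pd

def expand_listings(df):
    # df has one row per listing; PictureURL holds a list of URLs for that listing.
    expanded = df.explode('PictureURL').reset_index(drop=True)
    # every picture now repeats its listing's other features, giving one
    # training instance per picture
    return expanded

listings = pd.DataFrame({
    'ItemID': [1, 2],
    'PictureURL': [['urlA1', 'urlA2'], ['urlB1']],
    'Brand': ['Nike', 'Adidas'],
})
print(expand_listings(listings))  # 3 rows: urlA1, urlA2, urlB1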
def main():
'''
Main program creates/updates a csv file to use for ML training from live