mostly finished dl_pictures(). Need testing
parent fb327a9375
commit 34a6451400

ebay_api.py (59 lines changed)
@@ -278,12 +278,12 @@ class CurateData:
         expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

-        dict_pics_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
+        pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
         destination = 'your target folder' # decide whether or not you want to set a default folder to have the user define it as input every time. or have this only
         # defined in the download function

         # '''Will use temp_dict_pics for changing the training set at preprocessing'''
-        # temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
+        # temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list}
         # # TODO decide if the above is necesssary at this point or if it should
         # # be created at preprocessing or download
         #
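As an aside from the diff itself: a tiny self-contained sketch of what that commented-out temp_dict_pics comprehension would produce, i.e. a {source URL: local target path} map keyed off the picture URLs. The folder and URL below are made up for illustration; this is not the commit's code.

import re

destination = '/home/user/pics/'  # hypothetical target folder
pics_source_list = [
    'https://i.ebayimg.com/00/s/NzY4WDEwMjQ/z/abc123XYZ/$_12.JPG',  # made-up URL
]

# The lookahead grabs the path segment just before '/$_' (or the text right
# before a 'jpg' ending) to use as a local filename for the downloaded image.
dict_pics = {
    url: destination + re.search(r'[^/]+(?=/\$_|.jpg)', url, re.IGNORECASE).group() + '.jpg'
    for url in pics_source_list
}
# -> {'https://...$_12.JPG': '/home/user/pics/abc123XYZ.jpg'}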
@@ -296,37 +296,46 @@ class CurateData:
         # except ValueError:
         # json.dump(temp_dict_pics, f)

-        with open('dict_pics_list.txt', 'a+') as f: # Temp iterable for use w/executor
+        with open('pics_source_list.txt', 'a+') as f: # Temp iterable for use w/executor
             try:
-                dict_pics_list = json.load(f)
-                dict_pics_list.append(dict_pics_list)
-                json.dump(dict_pics_list, f)
+                pics_source_list = json.load(f)
+                pics_source_list.append(pics_source_list)
+                json.dump(pics_source_list, f)
             except ValueError:
-                json.dump(dict_pics_list, f)
+                json.dump(pics_source_list, f)

         return expanded_class, expanded_dropd

     def dl_pictures(self):
         '''
-        Downloads pictures from api to local storage using custom master dict
+        Downloads pictures from api to local storage using pics_source_list
+        and creates custom {source:target} dictionary as dict_pics
         '''

         with open('target_dirs.txt', 'a+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
             try:
                 target_dir = json.load(f)

             except ValueError:
-                target_dir = input('No default directory found. Create One? [y] or [n]:')
+                target_dir = input('No target directory found. Create One? [y] or [n]:')
                 if target_dir == 'y':
                     target_dir = input('Please provide full URL to destination folder')
                 else:
                     print('Creating default folder in current directory')
                     target_dir = os.getcwd()
                 json.dump(target_dir, f)

         with open('dict_pics.txt') as jf:
             dict_pics = json.load(jf)

-        with open('dict_pics_list.txt') as f:
-            dict_pics_list = json.load(f)
+        with open('pics_source_list.txt') as f:
+            pics_source_list = json.load(f)

         def dl_pic(pic):

             if os.path.exists(dict_pics[pic]):
                 pass

             else:
                 r = requests.get(pic, stream=True)
                 r.raw.decode_content = True
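For orientation, a rough sketch of the load-or-initialize persistence that the 'a+' blocks above are reaching for, assuming the files hold a plain JSON value; the helper names are invented and this is not the commit's code.

import json
import os

def load_or_init_list(path='pics_source_list.txt'):
    # Read the saved list if the file exists and parses; otherwise start fresh.
    if os.path.exists(path):
        with open(path) as f:
            try:
                return json.load(f)
            except ValueError:  # empty or malformed file
                return []
    return []

def save_list(items, path='pics_source_list.txt'):
    # Rewrite the whole file so it stays valid JSON.
    with open(path, 'w') as f:
        json.dump(items, f)

pics_source_list = load_or_init_list()
pics_source_list.extend(['https://example.com/fake_pic.jpg'])  # made-up URL
save_list(pics_source_list)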
@@ -334,14 +343,10 @@ class CurateData:
                     shutil.copyfileobj(r.raw, f)

         with concurrent.futures.ThreadPoolExecutor() as executor:
-            for future in executor.map(dl_pic, dict_pics_list):
+            for future in executor.map(dl_pic, pics_source_list):
                 future

-        with open('dict_pics_list.txt','w') as f:
-            dict_pics_list = []
-            json.dump(dict_pics_list, f)
-
-        temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
+        temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list}
         # TODO decide if the above is necesssary at this point or if it should
         # be created at preprocessing or download
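Pieced together across the hunks, a minimal standalone sketch of the dl_pic worker plus ThreadPoolExecutor pattern used here; dict_pics, the sample URL, and the lack of error handling are placeholders rather than the project's actual implementation.

import os
import shutil
import concurrent.futures
import requests

def dl_pic(pic, dict_pics):
    # Skip pictures already on disk; otherwise stream the image to its target path.
    target = dict_pics[pic]
    if os.path.exists(target):
        return target
    r = requests.get(pic, stream=True)
    r.raw.decode_content = True  # let urllib3 decode gzip/deflate while streaming
    with open(target, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
    return target

# dict_pics would map each source URL to a local file path, e.g. the mapping
# sketched earlier; the URL below is made up.
dict_pics = {'https://example.com/fake_pic.jpg': '/tmp/fake_pic.jpg'}

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(lambda p: dl_pic(p, dict_pics), dict_pics))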
@@ -354,25 +359,15 @@ class CurateData:
             except ValueError:
                 json.dump(temp_dict_pics, f)

+        with open('pics_source_list.txt','w') as f:
+            pics_source_list = []
+            json.dump(pics_source_list, f)

         # TODO pipeline gameplan: 5 files: master img download dict,raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
         # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
         # if not exists and append to master img download dict
         # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted
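One possible shape for the concat-and-inclusion-test step sketched in the TODO above; only the CSV filenames come from the comment, while the helper name, the drop_duplicates choice, and the dataframe arguments are assumptions.

import os
import pandas as pd

def update_training_sets(new_class_df, new_nvl_df):
    # Append freshly curated rows onto the two master training CSVs,
    # creating them on the first run.
    for path, new_df in [('master_class_training.csv', new_class_df),
                         ('master_nvl_training.csv', new_nvl_df)]:
        if os.path.exists(path):
            master = pd.concat([pd.read_csv(path), new_df], ignore_index=True)
        else:
            master = new_df
        # "Inclusion test": drop rows that were already present before saving.
        master.drop_duplicates(inplace=True)
        master.to_csv(path, index=False)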
     def update_df(self, data): # TODO save raw df as csv file
         '''
         Creates training instances for dataset. picture_url_list expanded to
         max available pictures with each picture url corresponding to features
         in common with same listing (i.e., because there are multiple pictures
         per listing, each picture will be its own training instance.
         '''
         pass

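A toy illustration of the expansion this docstring describes, where each picture URL becomes its own training row carrying the listing's shared features; pandas explode is one way to do it, and the ItemID/Label columns are invented for the example.

import pandas as pd

listings = pd.DataFrame({
    'ItemID': [1, 2],
    'Label': ['shirt', 'shoes'],
    'PictureURL': [['url_a', 'url_b'], ['url_c']],  # per-listing URL lists
})

# explode() repeats the listing-level features once per picture URL, so each
# picture can serve as a separate training instance.
instances = listings.explode('PictureURL', ignore_index=True)
#   ItemID  Label PictureURL
# 0      1  shirt      url_a
# 1      1  shirt      url_b
# 2      2  shoes      url_c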
         # TODO You will have to mess around more with pandas df to find a better solution to creating your csv file: i.e., create dataframe from from instances, run through process to customize your df
         # for final training set for your ml model training. Contemplate on the future... you want ability to update main csv AND training csv; one for updating raw data instances from search queries, and
         # the other for updating your training set.


 def main():
     '''
     Main program creates/updates a csv file to use for ML training from live