diff --git a/ebay_api.py b/ebay_api.py
index b3d6aa0..0423a9d 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -278,12 +278,12 @@ class CurateData:
         expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
 
-        dict_pics_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
+        pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
 
         destination = 'your target folder' # decide whether or not you want to set a default folder to have the user define it as input every time. or have this only
         # defined in the download function
 
 #        '''Will use temp_dict_pics for changing the training set at preprocessing'''
-#        temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
+#        temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list}
 #        # TODO decide if the above is necesssary at this point or if it should
 #        # be created at preprocessing or download
 #
@@ -296,37 +296,46 @@ class CurateData:
 #            except ValueError:
 #                json.dump(temp_dict_pics, f)
 
-        with open('dict_pics_list.txt', 'a+') as f: # Temp iterable for use w/executor
+        with open('pics_source_list.txt', 'a+') as f: # Temp iterable for use w/executor
             try:
-                dict_pics_list = json.load(f)
-                dict_pics_list.append(dict_pics_list)
-                json.dump(dict_pics_list, f)
-
+                saved_pics = json.load(f)
+                saved_pics.extend(pics_source_list) # extend, not append: appending the list to itself makes it un-serializable
+                json.dump(saved_pics, f)
             except ValueError:
-                json.dump(dict_pics_list, f)
+                json.dump(pics_source_list, f)
 
         return expanded_class, expanded_dropd
 
     def dl_pictures(self):
         '''
-        Downloads pictures from api to local storage using custom master dict
+        Downloads pictures from the API to local storage using pics_source_list
+        and creates a custom {source: target} dictionary as dict_pics
         '''
 
         with open('target_dirs.txt', 'a+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
             try:
                 target_dir = json.load(f)
+
             except ValueError:
-                target_dir = input('No default directory found. Create One? [y] or [n]:')
+                target_dir = input('No target directory found. Create one? [y] or [n]: ')
+                if target_dir == 'y':
+                    target_dir = input('Please provide the full path to the destination folder: ')
+                else:
+                    print('Creating default folder in current directory')
+                    target_dir = os.getcwd()
+                json.dump(target_dir, f) # persist the choice from either branch for future runs
+
         with open('dict_pics.txt') as jf:
             dict_pics = json.load(jf)
 
-        with open('dict_pics_list.txt') as f:
-            dict_pics_list = json.load(f)
+        with open('pics_source_list.txt') as f:
+            pics_source_list = json.load(f)
 
         def dl_pic(pic):
 
             if os.path.exists(dict_pics[pic]):
                 pass
+
             else:
                 r = requests.get(pic, stream=True)
                 r.raw.decode_content = True
@@ -334,14 +343,10 @@ class CurateData:
                     shutil.copyfileobj(r.raw, f)
 
         with concurrent.futures.ThreadPoolExecutor() as executor:
-            for future in executor.map(dl_pic, dict_pics_list):
+            for future in executor.map(dl_pic, pics_source_list):
                 future
 
-        with open('dict_pics_list.txt','w') as f:
-            dict_pics_list = []
-            json.dump(dict_pics_list, f)
-
-        temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
+        temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in pics_source_list}
 
         # TODO decide if the above is necesssary at this point or if it should
         # be created at preprocessing or download
@@ -354,25 +359,15 @@ class CurateData:
             except ValueError:
                 json.dump(temp_dict_pics, f)
 
+        with open('pics_source_list.txt','w') as f:
+            pics_source_list = [] # clear the queue only after temp_dict_pics has been built from it
+            json.dump(pics_source_list, f)
+
         # TODO pipeline gameplan: 5 files: master img download dict,raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
         # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
         # if not exists and append to master img download dict
         # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted
 
-    def update_df(self, data): # TODO save raw df as csv file
-        '''
-        Creates training instances for dataset. picture_url_list expanded to
-        max available pictures with each picture url corresponding to features
-        in common with same listing (i.e., because there are multiple pictures
-        per listing, each picture will be its own training instance.
-        '''
-        pass
-
-        # TODO You will have to mess around more with pandas df to find a better solution to creating your csv file: i.e., create dataframe from from instances, run through process to customize your df
-        # for final training set for your ml model training. Contemplate on the future... you want ability to update main csv AND training csv; one for updating raw data instances from search queries, and
-        # the other for updating your training set.
-
 def main():
     '''
     Main program creates/updates a csv file to use for ML training from live
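
A few review notes on the new code follow; each sketch is illustrative rather than a drop-in patch.

First, the `pics_source_list.txt` round-trip keeps a quirk of `'a+'` mode: the stream position starts at end-of-file, so `json.load(f)` always reads an empty stream and raises `ValueError`, and `json.dump` then appends after any existing content, leaving two JSON documents back to back on the second run. A minimal read-modify-write sketch, assuming the file should hold a single de-duplicated JSON list (the helper name `append_to_json_list` is made up for illustration):

```python
import json
import os

def append_to_json_list(path, new_items):
    """Hypothetical helper: merge new_items into the JSON list stored at path."""
    saved = []
    if os.path.exists(path):
        with open(path) as f:
            try:
                saved = json.load(f)
            except ValueError:  # empty or malformed file
                pass
    saved.extend(item for item in new_items if item not in saved)  # keep the list de-duplicated
    with open(path, 'w') as f:  # 'w' truncates, so the file always holds one valid JSON document
        json.dump(saved, f)
    return saved
```

Reading and writing through separate `open()` calls sidesteps the cursor bookkeeping (`f.seek(0)`, `f.truncate()`) that a single `'r+'` handle would require.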
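
Second, the download loop works only because `executor.map()` is lazy: iterating the generator is what drives the downloads, the loop variable holds a plain return value rather than a `Future`, and the bare `future` statement is a no-op. A tighter sketch, with the sample `pics_source_list`/`dict_pics` values standing in for the real data loaded from disk:

```python
import concurrent.futures
import os
import shutil

import requests

# hypothetical stand-ins for the data the real code loads from its .txt files
pics_source_list = ['https://i.ebayimg.com/images/g/abc/s-l1600.jpg']
dict_pics = {url: os.path.join(os.getcwd(), url.split('/')[-1]) for url in pics_source_list}

def dl_pic(pic):
    """Download one picture unless its target file already exists."""
    target = dict_pics[pic]
    if os.path.exists(target):
        return
    r = requests.get(pic, stream=True)
    r.raw.decode_content = True  # transparently decompress gzip/deflate bodies
    with open(target, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

with concurrent.futures.ThreadPoolExecutor() as executor:
    # map() yields results lazily, so list() both drives every download to
    # completion and re-raises the first exception a worker hit
    list(executor.map(dl_pic, pics_source_list))
```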
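
Finally, the `temp_dict_pics` comprehension has three small hazards: `target_dir+...` omits the path separator unless `target_dir` happens to end with one, the unescaped dot in `(?=/\$_|.jpg)` lets that branch match any character followed by `jpg`, and `re.search(...).group()` raises `AttributeError` for any URL the pattern misses. A guarded version, assuming the same inputs as the sketch above:

```python
import os
import re

def local_name(url):
    """Derive a local file name from an eBay picture URL, or return None."""
    m = re.search(r'[^/]+(?=/\$_|\.jpg)', url, re.IGNORECASE)  # \. so only a literal '.jpg' matches
    return m.group() + '.jpg' if m else None

target_dir = os.getcwd()  # assumption: whatever dl_pictures resolved target_dir to
temp_dict_pics = {}
for url in pics_source_list:  # pics_source_list as defined in the previous sketch
    name = local_name(url)
    if name is not None:
        temp_dict_pics[url] = os.path.join(target_dir, name)  # join() supplies the separator
```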