diff --git a/curate.py b/curate.py index 7972644..241525c 100644 --- a/curate.py +++ b/curate.py @@ -15,7 +15,6 @@ expanded_dfs = curate.expand_nvlclass(class_training, dropd) expanded_class = expanded_dfs[0] expanded_dropd = expanded_dfs[1] -dict_pics = expanded_dfs[2] +# dict_pics = expanded_dfs[2] -# TODO # need to replace expanded df's PictureURL col values with destination urls # TODO # still have the problem of duplicate listings. Possibly take care of this before you run curate diff --git a/ebay_api.py b/ebay_api.py index 3f40a17..b3d6aa0 100644 --- a/ebay_api.py +++ b/ebay_api.py @@ -1,11 +1,11 @@ import importlib +import os import numpy as np import concurrent.futures import json import requests import pandas as pd import config as cfg -import wget # NOTE may not need this import shutil import re @@ -125,7 +125,7 @@ class ShoppingApi: try: response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1) response.raise_for_status() - except request.exceptions.RequestException: + except requests.exceptions.RequestException: print('connection error') response = response.json() response = response['Item'] @@ -251,14 +251,14 @@ class CurateData: dropd = nvl_training.drop(col_drop, axis=1) return dropd - def combine_nvlclass(self, class_training, dropd): - final_training = pd.concat([class_training, dropd], axis=1) - return final_training # TODO might not need this function - - def make_dict_pics(self, expanded_class_training): - with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up - dict_pics = json.load(jf) - dict_pics.extend('<') +# def combine_nvlclass(self, class_training, dropd): +# final_training = pd.concat([class_training, dropd], axis=1) +# return final_training # TODO might not need this function +# +# def make_dict_pics(self, expanded_class_training): +# with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up +# dict_pics = json.load(jf) +# dict_pics.extend('<') def expand_nvlclass(self, class_training, dropd): @@ -269,54 +269,96 @@ class CurateData: ''' expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object expanded_class = expanded_class.dropna(subset=['PictureURL']) - expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']) - expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x]) + expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True) + # expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x]) expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop duplicates here or before instantiating curate object expanded_dropd = expanded_dropd.dropna(subset=['PictureURL']) - expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']) - expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x]) + expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True) + # expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x]) expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values dict_pics_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call destination = 'your target folder' # decide whether or not you want to set a default folder to have the user define it as input every time. or have this only # defined in the download function - dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list} - expanded_class = expanded_class - # with open('dict_pics.txt','w+') as f: # TODO open if it exists, or write if not, then extend the dictionary with dict_pics +# '''Will use temp_dict_pics for changing the training set at preprocessing''' +# temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list} +# # TODO decide if the above is necesssary at this point or if it should +# # be created at preprocessing or download +# +# with open('dict_pics.txt', 'w') as f: +# try: +# dict_pics = json.load(f) +# dict_pics.update(temp_dict_pics) +# json.dump(dict_pics, f) # TODO This completely overwrites the old file. Fix to exclude corruptions +# +# except ValueError: +# json.dump(temp_dict_pics, f) - return expanded_class, expanded_dropd, dict_pics # TODO still need to replace source url to destination url in df cols and create custom dict {, } + with open('dict_pics_list.txt', 'a+') as f: # Temp iterable for use w/executor + try: + dict_pics_list = json.load(f) + dict_pics_list.append(dict_pics_list) + json.dump(dict_pics_list, f) + + except ValueError: + json.dump(dict_pics_list, f) + + return expanded_class, expanded_dropd def dl_pictures(self): ''' Downloads pictures from api to local storage using custom master dict ''' - with open('dict_pic.txt', 'w+') as jf: # avoid duplicate logic goes here... I think + with open('target_dirs.txt', 'a+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments + try: + target_dir = json.load(f) + except ValueError: + target_dir = input('No default directory found. Create One? [y] or [n]:') + with open('dict_pics.txt') as jf: dict_pics = json.load(jf) - r = requests.get('', stream=True) - r.raw.decode_content = True - filename = '' - with open(filename, 'wb') as f: - shutil.copyfileobj(r.raw, f) + with open('dict_pics_list.txt') as f: + dict_pics_list = json.load(f) - # NOTE consider adding this dl_pictures func inside another func that uses - # threading to fund the dl_pictures func here somewhere + def dl_pic(pic): - # PictureURL in PictureURL list can't be downloaded....have to use indirect address in the form https://i.ebayimg.com/images/g//s-l.jpg - # in place of https://i.ebayimg.com/00/s/ODQwWDE2MDA=/z//$_1.JPG or use requests methods instead of wget and original PictureURL? yes, use requests + if os.path.exists(dict_pics[pic]): + pass + else: + r = requests.get(pic, stream=True) + r.raw.decode_content = True + with open(dict_pics[pic], 'wb') as f: # might not work? + shutil.copyfileobj(r.raw, f) + + with concurrent.futures.ThreadPoolExecutor() as executor: + for future in executor.map(dl_pic, dict_pics_list): + future + + with open('dict_pics_list.txt','w') as f: + dict_pics_list = [] + json.dump(dict_pics_list, f) + + temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list} + # TODO decide if the above is necesssary at this point or if it should + # be created at preprocessing or download + + with open('dict_pics.txt', 'w') as f: + try: + dict_pics = json.load(f) + dict_pics.update(temp_dict_pics) + json.dump(dict_pics, f) # TODO This completely overwrites the old file. Fix to exclude corruptions + + except ValueError: + json.dump(temp_dict_pics, f) # TODO pipeline gameplan: 5 files: master img download dict,raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures # if not exists and append to master img download dict # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted - with concurrent.futures.ThreadPoolExecutor() as executor: - for future in executor.map(download_function, master_url_dict): - pass - def update_df(self, data): # TODO save raw df as csv file ''' Creates training instances for dataset. picture_url_list expanded to