diff --git a/curate.py b/curate.py index ed31371..22c6db5 100644 --- a/curate.py +++ b/curate.py @@ -12,7 +12,9 @@ class_training = curate.class_training(training) nvl_training = curate.nvl_training(training) dropd = curate.drop_nvl_cols(nvl_training) -# expand_nvlclass(class_training, dropd) -# extracted_df = curate.extract_contents(expanded_dropd) # only extract contents after running expand_nvlclass and returning expanded dropd +expanded_dfs = curate.expand_nvlclass(class_training, dropd) +expanded_class = expanded_dfs[0] +expanded_dropd = expanded_dfs[1] # TODO # need to replace expanded df's PictureURL col values with destination urls +# TODO # still have the problem of duplicate listings. Possibly take care of this before you run curate diff --git a/ebay_api.py b/ebay_api.py index 0d6bddf..c576c29 100644 --- a/ebay_api.py +++ b/ebay_api.py @@ -7,6 +7,7 @@ import pandas as pd import config as cfg import wget # NOTE may not need this import shutil +import re class FindingApi: '''Methods for accessing eBays FindingApi services''' @@ -204,11 +205,11 @@ class CurateData: return nvl_training - def extract_contents(self, df): + def extract_df(self, df): ''' converts single-value lists of strings of any df to string if not null ''' - extracted_df = df.applymap(lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isull(x) else x) + extracted_df = df.applymap(lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isnull(x) else x) return extracted_df @@ -257,9 +258,10 @@ class CurateData: def make_dict_pics(self, expanded_class_training): with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up dict_pics = json.load(jf) -# dict_pics.extend('< + dict_pics.extend('<') - def expand_nvlclass(class_training, dropd): + + def expand_nvlclass(self, class_training, dropd): ''' takes image url list from each cell and expands them into separate/duplicate instances. Modifies both class training and dropd dfs. Appends custom @@ -268,9 +270,16 @@ class CurateData: expanded_class_training = class_training.explode('PictureURL').reset_index(drop=True) # expanded_class_training.PictureURL.apply(lambda x: 'c:/users/unknown/ expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) + expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values # expanded_dropd.PictureURL.apply(lambda x: 'c:/users/unknown/ - return expanded_class_training, expanded_dropd, dict_pics# TODO still need to replace source url to destination url in df cols and create custom dict {, } + # curate.make_dict_pics(expanded_class_training) # custom image url dict + dict_pics_list = set(expanded_class_training.PictureURL.to_list()) # prolly need to create set long before df... immediately after Shopping or trading call + dict_pics = {k:destination+re.search(r'\w+(?=/\$_)', a).group()+'.jpg' for k in dict_pics_list} # TODO determine how to implement destination variable + + # re.search(r'\w+(?=/\$_)', a).group() + + return expanded_class_training, expanded_dropd # TODO still need to replace source url to destination url in df cols and create custom dict {, } def dl_pictures(self): '''