diff --git a/ebay_api.py b/ebay_api.py
index 144cde2..71cfd27 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -61,7 +61,7 @@ class FindingApi:
 
             for future in executor.map(lambda p: self.get_data(*p), args):
                 data = future
-            try: # TODO if conditions are not working due to each thread checking the same unedited item_id_results list
+            try: # TODO: the if conditionals may not be working because each thread checks the same unedited itemid_results_list
                 training = pd.read_csv('training.csv')
                 for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                     if (item not in training.values) and (item not in itemid_results_list):
@@ -139,18 +139,23 @@ class CurateData:
 
     def to_training(self, data):
         raw_df = self.data_frame(data)
-        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
+        interm_df1 = raw_df[['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
         interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
         training = interm_df1
-        return training
 
-    def nvl_dict(self, training):
+    def class_training(self, training):
+        class_df = training[['PictureURL', 'PrimaryCategoryID']]
+        return class_df
+
+    def nvl_training(self, training):
         interm_df1 = pd.Series(training.ItemSpecifics)
         interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
         nvl_dict = interm_df1.apply(lambda x: {k:v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])})
-        return nvl_dict
-
+        nvl_df = pd.json_normalize(nvl_dict)
+        nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)
+        return nvl_training
+# TODO Still need to extract strings from the lists of strings and then drop the ones you don't want (or vice versa). You may have to avoid using cells whose lists are longer than one element (e.g., 'Features').
     def update_df(self, data):
         '''
         Creates training instances for dataset. picture_url_list expanded to
@@ -159,11 +164,16 @@ class CurateData:
         per listing, each picture will be its own training instance.
         '''
+        # Ultimately each record needs to be one picture URL as input, with the relevant columns determined from the custom nvl_dicts. Figure out how to address the multiple values in the lists when you make the df just before the final df (that one may still include the multiple pictures from each list in the original records). This should be your next step.
+
+        # Considering the above, you need to figure out how to expand the URL list while keeping the nvl_df intact (see the sketch after this diff).
+        # But before the above two comments, first figure out what format the df needs to be in for training. You require multilabel/multiclass(?)...consult that one article on identifying rainforests, and also Hands-On Machine Learning. Also consult HPP for combining dfs efficiently.
+
         # USE combination of apply() and dict comprehension to extract your custom nvl_dict from nvl in each cell
         # USE training.apply(func, axis= something) to create your custom nvl_dict for each cell
         # USE raw_df.loc[:, ['col1', col2', 'col3', 'etc']] for creating new df. There may be another way though.
-        # USE pd.merge() at some point...possibly after expanding lists and nvl
+        # USE pd.merge() at some point...possibly after expanding lists and nvl. Consult the HPP book for a more efficient way to combine dfs.
         # USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can transform each new raw_df first before combining it with the previous transformed
         # df. then you can take the raw_df and combine it with the old raw_df for backup.
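A rough sketch of the expansion step described in the update_df comments above, for reference. It assumes to_training() hands back the training frame and that nvl_training() yields one row per listing, with a PictureURL column holding a list of URLs plus the flattened name/value columns. The helper names expand_urls and scalarize, and the demo column names, are illustrative assumptions, not part of this diff; the idea is simply DataFrame.explode to give each picture URL its own row, then unwrapping one-element lists and dropping columns that still hold multi-valued lists such as 'Features'.

import pandas as pd


def expand_urls(nvl_training):
    # One training instance per picture: explode the list-valued PictureURL
    # column so every URL gets its own row, repeating the NVL columns.
    return nvl_training.explode('PictureURL').reset_index(drop=True)


def scalarize(df):
    # Unwrap one-element lists (e.g. ['Nike'] -> 'Nike') and drop any column
    # that still holds multi-valued lists, such as 'Features'.
    def unwrap(x):
        return x[0] if isinstance(x, list) and len(x) == 1 else x

    out = df.applymap(unwrap)
    multi = [c for c in out.columns
             if out[c].apply(lambda x: isinstance(x, list)).any()]
    return out.drop(columns=multi)


if __name__ == '__main__':
    # Tiny frame shaped like the assumed nvl_training() output.
    demo = pd.DataFrame({
        'PictureURL': [['https://example.com/a.jpg', 'https://example.com/b.jpg']],
        'Brand': [['Nike']],
        'Features': [['Lace Up', 'Cushioned']],
    })
    print(scalarize(expand_urls(demo)))  # two rows, one per picture URL

The exploded frame can then be concatenated with previously curated frames via pd.concat([old_df, new_df], sort=False) before writing out to the larger csv files, as the comments in update_df suggest.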