diff --git a/ebay_api.py b/ebay_api.py
index 11978ce..144cde2 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -127,6 +127,30 @@ class CurateData:
     value pairs that gets updated to custom dataframe used in Ml training
     sets.
     '''
+    def import_raw(self):
+        with open('raw_data.txt') as f:
+            raw_data = json.load(f)
+        return raw_data
+
+    def data_frame(self, data):
+        to_json = json.dumps(data)
+        raw_df = pd.read_json(to_json)
+        return raw_df
+
+    def to_training(self, data):
+        raw_df = self.data_frame(data)
+        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
+        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
+        training = interm_df1
+
+        return training
+
+    def nvl_dict(self, training):
+        interm_df1 = pd.Series(training.ItemSpecifics)
+        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
+        nvl_dict = interm_df1.apply(lambda x: {n['Name']: n['Value'] for n in x})
+        return nvl_dict
+
     def update_df(self, data):
         '''
         Creates training instances for dataset. picture_url_list expanded to
@@ -135,17 +159,11 @@ class CurateData:
         per listing, each picture will be its own training instance.
         '''
 
-    def data_frame(self, data):
-        to_json = json.dumps(data)
-        raw_df = pd.read_json(to_json)
-        return raw_df
-
-    def to_training(self):
-        raw_df = self.data_frame(data)
-        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
-        interm_df1[['ItemID', 'PrimaryCAegoryID']] = interm_df[['ItemID', 'PrimaryCategoryID']].astype(str)
+    # USE a combination of apply() and a dict comprehension to extract the custom nvl_dict from the nvl in each cell
+    # USE training.apply(func, axis=something) to create the custom nvl_dict for each cell
 
     # USE raw_df.loc[:, ['col1', col2', 'col3', 'etc']] for creating new df. There may be another way though.
+    # USE pd.merge() at some point...possibly after expanding lists and nvl
 
     # USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can transform each new raw_df first before combining it with the previous transformed
     # df. then you can take the raw_df and combine it with the old raw_df for backup.
@@ -170,6 +188,9 @@ if __name__ == "__main__":
 # to divide these up into the categories. This will leave you with about 6.25K results per cat.
 # More than enough data for your dataset.
 
+# For future reference: to deal with inconsistent values in the nvl (sellers can enter custom values in these fields), you can drop either listings or k/v pairs that are unique.
+# Determine which pairs are unique by applying a function that counts the frequency of each k/v pair, then drop the pairs whose frequency is 1.
+
 # TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
 # TO AVOID HICCUPS WHEN CREATING DATASET
 # TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
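
One possible follow-up to the pd.merge() and "drop unique k/v pairs" notes in the diff (a hedged sketch, not code from this commit): count how often each name/value pair occurs across listings, drop the one-off pairs, expand the survivors into columns, and merge them back onto the training dataframe. The helper name merge_nvl_columns and the exact column handling are assumptions for illustration only.

    # Hypothetical sketch; assumes `training` is the dataframe from
    # CurateData.to_training and `nvl_dicts` is the Series of {Name: Value}
    # dicts from CurateData.nvl_dict.
    from collections import Counter

    import pandas as pd


    def merge_nvl_columns(training, nvl_dicts):
        # Frequency of every (name, value) pair across all listings; str()
        # guards against unhashable values such as lists.
        pair_counts = Counter(
            (name, str(value))
            for nvl in nvl_dicts
            for name, value in nvl.items()
        )

        # Keep only pairs seen more than once (the "drop those that have 1" idea).
        filtered = nvl_dicts.apply(
            lambda nvl: {name: value for name, value in nvl.items()
                         if pair_counts[(name, str(value))] > 1}
        )

        # Expand the dicts into one column per Name and merge on the index.
        nvl_df = pd.DataFrame(filtered.tolist(), index=filtered.index)
        return pd.merge(training, nvl_df, left_index=True, right_index=True)

The merged frame could then feed the pd.concat() step described in the comments when combining each transformed batch into the larger CSV files.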