added nvl_dict function to create a Series based on the NameValueList (nvl) dict of raw data

This commit is contained in:
spbeach46 2021-01-26 23:01:00 -07:00
parent d09090dc13
commit 18871a76d7


@ -127,6 +127,30 @@ class CurateData:
value pairs that get updated to the custom dataframe used in ML training sets.
'''
def import_raw(self):
with open('raw_data.txt') as f:
raw_data = json.load(f)
return raw_data
def data_frame(self, data):
to_json = json.dumps(data)
raw_df = pd.read_json(to_json)
return raw_df
def to_training(self, data):
raw_df = self.data_frame(data)
interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
training = interm_df1
return training
def nvl_dict(self, training):
# extract the NameValueList from each listing's ItemSpecifics and flatten it into a single {Name: Value} dict per listing
interm_df1 = pd.Series(training.ItemSpecifics)
interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
nvl_dict = interm_df1.apply(lambda x: {n['Name']: n['Value'] for n in x})
return nvl_dict
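For reference, a minimal sketch of how the new nvl_dict method chains with the other methods above (hypothetical driver code, not part of this commit; assumes raw_data.txt exists, CurateData takes no constructor arguments, and each ItemSpecifics entry contains a NameValueList):

# hypothetical usage; assumes raw_data.txt was saved by an earlier scraping run
curate = CurateData()
raw_data = curate.import_raw()            # cached JSON response loaded from raw_data.txt
training = curate.to_training(raw_data)   # dataframe with the columns used for ML training sets
nvl = curate.nvl_dict(training)           # Series of {Name: Value} dicts, one per listing
print(nvl.iloc[0])                        # e.g. {'Brand': '...', 'US Shoe Size': '...'}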
def update_df(self, data):
'''
Creates training instances for dataset. picture_url_list expanded to
@ -135,17 +159,11 @@ class CurateData:
per listing, each picture will be its own training instance.
'''
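The docstring above describes expanding the picture URL list so that each picture becomes its own training instance; one way to do that (a sketch, not necessarily the committed implementation) is pandas' explode:

# one row per picture: every entry of a listing's PictureURL list becomes its own training instance
# (assumes 'training' is the dataframe returned by to_training)
expanded = training.explode('PictureURL').reset_index(drop=True)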
def data_frame(self, data):
to_json = json.dumps(data)
raw_df = pd.read_json(to_json)
return raw_df
def to_training(self):
raw_df = self.data_frame(data)
interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
interm_df1[['ItemID', 'PrimaryCAegoryID']] = interm_df[['ItemID', 'PrimaryCategoryID']].astype(str)
# USE a combination of apply() and dict comprehension to extract your custom nvl_dict from the nvl in each cell
# USE training.apply(func, axis=something) to create your custom nvl_dict for each cell
# USE raw_df.loc[:, ['col1', 'col2', 'col3', 'etc']] for creating a new df. There may be another way though.
# USE pd.merge() at some point...possibly after expanding lists and nvl
# USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can transform each new raw_df first before combining it with the previous transformed
# df. Then you can take the raw_df and combine it with the old raw_df for backup.
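A rough sketch of the concat/backup idea from the notes above (the df names are placeholders, not variables that exist in this module):

# old_*/new_* are placeholder names for the accumulated and freshly transformed frames
combined_training = pd.concat([old_training_df, new_training_df], sort=False)
# keep the untransformed responses as a growing backup as well
combined_raw = pd.concat([old_raw_df, new_raw_df], sort=False)
combined_raw.to_csv('raw_backup.csv', index=False)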
@ -170,6 +188,9 @@ if __name__ == "__main__":
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.
# for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique, which
# can be determined by applying a function to find the frequency of k/v pairs --> list of unique k/v pairs --> function to determine frequency of unique k/v pairs --> drop those with a frequency of 1.
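A sketch of the frequency idea from the note above, simplified to the Name level and assuming nvl_dict has already produced a Series of {Name: Value} dicts (variable names are illustrative):

from collections import Counter

# nvl is assumed to be the Series returned by nvl_dict(training)
# count how often each item-specifics Name appears across all listings
name_counts = Counter(name for d in nvl for name in d)
# drop names that only ever appear once; those are most likely seller-entered custom fields
cleaned = nvl.apply(lambda d: {k: v for k, v in d.items() if name_counts[k] > 1})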
# TODO NEED TO ADD TRY/EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
# TO AVOID HICCUPS WHEN CREATING DATASET
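One possible shape for the try/except TODO above (safe_call is a placeholder helper, not an existing function in this repo):

import time

def safe_call(func, *args, retries=3, delay=5, **kwargs):
    # func is whatever API call you want to protect (placeholder)
    # generic wrapper so every API call gets the same basic error handling
    for attempt in range(1, retries + 1):
        try:
            return func(*args, **kwargs)
        except Exception as err:
            print(f"API call failed ({err}); retry {attempt} of {retries}")
            time.sleep(delay)
    return None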
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS, EITHER USING YOUR OWN TAGS OR BY FINDING TAGS ON OTHERS' LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS.
# It may be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the
# pictures in a training set, but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or
# information on tags.