Dropped NA values in the training df. Used reindex instead of loc to select columns.

spbeach46 2021-05-30 15:51:05 -07:00
parent c2a812f7a8
commit 9fc00f4eab


@@ -41,7 +41,7 @@ class FindingApi:
"SERVICE-VERSION":"1.13.0",
"RESPONSE-DATA-FORMAT":"JSON",
"categoryId":category_id,
"paginationInput.entriesPerPage":"10", # TODO change back to max = 100
"paginationInput.entriesPerPage":"100",
"paginationInput.PageNumber":i,
"itemFilter(0).name":"Condition",
"itemFilter(0).value":"Used"
@@ -248,7 +248,7 @@ class CurateData:
raw_data = json.load(f)
return raw_data
def raw_df(self, raw_data):
def raw_df(self, raw_data): # TODO not dropping dupes, and is appending raw_data for some reason
'''
creates pandas df from raw json and saves master raw csv file as raw_df.csv.
Intended to be used inline with direct
@@ -256,14 +256,16 @@ class CurateData:
'''
to_json = json.dumps(raw_data)
raw_df = pd.read_json(to_json)
raw_df.to_csv('raw_df.csv', mode='a')
raw_df = pd.read_csv('raw_df.csv', index_col=0)
raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # drops dupes after appending new data. (ShoppingApi call might include dupes)
raw_df.to_csv('raw_df.csv', mode='a') # TODO this might still only save the unmodified/undropped original. check to make sure
raw_df.to_csv('raw_df.csv') # NOTE not append mode because raw_df is made from the master raw_data.txt file
#raw_df = pd.read_csv('raw_df.csv', index_col=0)
#raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # may not need this
#raw_df.to_csv('raw_df.csv')
# TODO still saving "Unnamed:0" column
return raw_df
def to_training(self, raw_data): # NOTE need to create copies not views
def to_training(self, raw_data):
'''
creates first pass of potential labels for training set. This is the base
df used to produce other training sets to use.
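The TODO about only saving the unmodified frame points at a real pandas gotcha: drop_duplicates returns a new DataFrame, so calling it without reassigning (as the commented-out line does) changes nothing, and mode='a' re-appends every old row on each run. A sketch of the read-merge-dedupe-overwrite pattern those comments are circling, with save_master_raw as a hypothetical helper name:

    import pandas as pd

    def save_master_raw(raw_df, path='raw_df.csv'):
        # Merge with the existing master file if there is one.
        try:
            existing = pd.read_csv(path, index_col=0)
            raw_df = pd.concat([existing, raw_df], ignore_index=True)
        except FileNotFoundError:
            pass
        # drop_duplicates returns a copy; reassignment is what makes it stick
        raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)
        # index=False also stops the stray "Unnamed: 0" column noted above
        raw_df.to_csv(path, index=False)
        return raw_df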
@@ -271,14 +273,14 @@ class CurateData:
raw_df = self.raw_df(raw_data)
interm_df1 = raw_df.loc[:,['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
training = interm_df1
return training # TODO RENAME THIS FUNC AND RETURN VALUE
training = interm_df1.dropna(subset=['ItemSpecifics'])
return training # TODO RENAME THIS FUNC AND its RETURN VALUE
def class_training(self, training):
'''Training set for multiclass portion of training set. Used to train
separately from multilabel portion
'''
class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
return class_training
def nvl_training(self, training):
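The NOTE removed above ("need to create copies not views") refers to the usual SettingWithCopyWarning trap: a loc slice can come back as a view, and assigning into it may not propagate. A sketch of to_training doing the same steps on an explicit copy:

    def to_training(self, raw_data):
        raw_df = self.raw_df(raw_data)
        cols = ['ItemID', 'PictureURL', 'PrimaryCategoryID',
                'PrimaryCategoryName', 'Title', 'ItemSpecifics']
        interm_df1 = raw_df.loc[:, cols].copy()  # explicit copy, not a view
        interm_df1[['ItemID', 'PrimaryCategoryID']] = (
            interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str))
        return interm_df1.dropna(subset=['ItemSpecifics'])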
@@ -326,7 +328,7 @@ class CurateData:
'PictureURL', 'Style', 'Department', 'Type', 'Gender', 'Closure', 'Performance/Activity',
'Accents', 'Occasion', 'Toe Shape', 'Pattern', 'Activity',
'Heel Style', 'Fastening', 'Heel Type', 'Toe Type', 'Departement',
'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material',
'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material'
]
# May be no difference between Product type and sub style; fastening and
# closure; toe shape and toe type; occasion and performance/activity;
@@ -336,8 +338,9 @@ class CurateData:
# user created item specifics, leaving only predefined ebay item specs
user_input = input('drop or keep cols?:')
if 'keep' in user_input:
dropd = nvl_training.loc[:,col_keep]
dropd = nvl_training.reindex(columns=col_keep)
else:
dropd = nvl_training.drop(col_drop, axis=1)
return dropd
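reindex and loc behave differently on missing labels, which matters here because not every scrape returns every item specific: df.loc[:, cols] raises a KeyError if any column in cols is absent, while df.reindex(columns=cols) quietly adds it as an all-NaN column. Note that reindex takes the column list through the columns keyword (or axis=1) rather than positionally, where it would be read as a row index. A small demonstration with made-up data:

    import pandas as pd

    df = pd.DataFrame({'Style': ['sneaker'], 'Department': ['Men']})
    cols = ['Style', 'Department', 'Heel Type']  # 'Heel Type' not in df

    picked = df.reindex(columns=cols)  # 'Heel Type' comes back as all-NaN
    # df.loc[:, cols] would raise KeyError because 'Heel Type' is missing
    print(picked)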
@@ -364,7 +367,7 @@ class CurateData:
expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # probably need to create set long before df... immediately after Shopping or Trading call
# defined in the download function
#: defined in the download function
try:
@@ -380,15 +383,15 @@ class CurateData:
# Append to master training dataframes, drop potential dupes and save
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')
expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
expanded_class.to_csv('expanded_class.csv')
# expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
# expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
expanded_dropd.to_csv('expanded_dropd.csv')
# expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
# expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
return expanded_class, expanded_dropd
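Since the same read-dedupe-write dance appears twice here (and once more in raw_df), it could live in one place. A sketch under the same assumptions as above, with _dedupe_and_save as a hypothetical helper name:

    def _dedupe_and_save(df, path):
        # Keep one row per PictureURL and overwrite the CSV in place.
        df = df.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        df.to_csv(path, index=False, encoding='utf-8')
        return df

    expanded_class = _dedupe_and_save(expanded_class, 'expanded_class.csv')
    expanded_dropd = _dedupe_and_save(expanded_dropd, 'expanded_dropd.csv')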