diff --git a/ebay_api.py b/ebay_api.py
index ab3e87d..7fdb368 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -41,7 +41,7 @@ class FindingApi:
             "SERVICE-VERSION":"1.13.0",
             "RESPONSE-DATA-FORMAT":"JSON",
             "categoryId":category_id,
-            "paginationInput.entriesPerPage":"10", # TODO change back to max = 100
+            "paginationInput.entriesPerPage":"100",
             "paginationInput.PageNumber":i,
             "itemFilter(0).name":"Condition",
             "itemFilter(0).value":"Used"
@@ -248,7 +248,7 @@ class CurateData:
             raw_data = json.load(f)
             return raw_data

-    def raw_df(self, raw_data):
+    def raw_df(self, raw_data): # TODO not dropping dupes, and is appending raw_data for some reason
         '''
         creates pandas df from raw json and saves master raw csv file as
         raw_df.csv. Indended to be used inline with direct
@@ -256,14 +256,16 @@ class CurateData:
         '''
         to_json = json.dumps(raw_data)
         raw_df = pd.read_json(to_json)
-        raw_df.to_csv('raw_df.csv', mode='a')
-        raw_df = pd.read_csv('raw_df.csv', index_col=0)
-        raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # drops dupes after appending new data. (ShoppingApi call might include dupes)
-        raw_df.to_csv('raw_df.csv', mode='a') # TODO this might still only save the unmodified/undropped original. check to make sure
+        raw_df.to_csv('raw_df.csv') # NOTE not append mode because raw_df is made from the master raw_data.txt file
+        #raw_df = pd.read_csv('raw_df.csv', index_col=0)
+        #raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # may not need this
+        #raw_df.to_csv('raw_df.csv')
+
+        # TODO still saving "Unnamed:0" column
         return raw_df

-    def to_training(self, raw_data): # NOTE need to create copies not views
+    def to_training(self, raw_data):
         '''
         creates first pass of potential labels for training set. This is
         the base df used to produce other training sets to use.
@@ -271,14 +273,14 @@ class CurateData:
         raw_df = self.raw_df(raw_data)
         interm_df1 = raw_df.loc[:,['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
         interm_df1[['ItemID', 'PrimaryCAegoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
-        training = interm_df1
-        return training # TODO RENAME THIS FUNC AND RETURN VALUE
+        training = interm_df1.dropna(subset=['ItemSpecifics'])
+        return training # TODO RENAME THIS FUNC AND its RETURN VALUE

     def class_training(self, training):
         '''Training set for multiclass portion of training set.
        Used to train seprately from multilabel portion
        '''
-        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
+        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
         return class_training

     def nvl_training(self, training):
@@ -326,7 +328,7 @@ class CurateData:
             'PictureURL', 'Style', 'Department', 'Type', 'Gender', 'Closure',
             'Performance/Activity', 'Accents', 'Occasion', 'Toe Shape',
             'Pattern', 'Activity', 'Heel Style', 'Fastening', 'Heel Type', 'Toe Type', 'Departement',
-            'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material',
+            'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material'
             ]
         # May be no difference between Product type and sub style; fastening and
         # closure; toe shape and toe type; occasion and performance/activity;
@@ -336,8 +338,9 @@ class CurateData:
         # user created item specifics, leaving only predefined ebay item specs

         user_input = input('drop or keep cols?:')
+
         if 'keep' in user_input:
-            dropd = nvl_training.loc[:,col_keep]
+            dropd = nvl_training.reindex(columns=col_keep)
         else:
             dropd = nvl_training.drop(col_drop, axis=1)
         return dropd
@@ -364,7 +367,7 @@ class CurateData:
         expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

         temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
-        # defined in the download function
+        #: defined in the download function

         try:
@@ -380,15 +383,15 @@

         # Append to master training dataframes, drop potential dupes and save
-        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')
-        expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
-        expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
+        expanded_class.to_csv('expanded_class.csv')
+        # expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
+        # expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
+        # expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies

-        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
-        expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
-        expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
+        expanded_dropd.to_csv('expanded_dropd.csv')
+        # expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
+        # expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
+        # expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')

         return expanded_class, expanded_dropd
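Note on the TODOs this patch leaves open: the dupes that never get dropped and the stray "Unnamed: 0" column both come down to default pandas behavior. drop_duplicates() returns a new DataFrame rather than modifying the caller, so an unassigned call like the ones in the removed lines is a no-op, and to_csv() writes the integer index by default, which the next read_csv() then surfaces as an "Unnamed: 0" column. A minimal sketch of a save step that covers both; dedupe_and_save is a hypothetical helper name, while the file names and key columns (raw_df.csv, ItemID, PictureURL) come from the patch:

    import pandas as pd

    def dedupe_and_save(df, path, key):
        # drop_duplicates returns a copy; reassign it or the drop is lost
        df = df.drop_duplicates(subset=[key]).reset_index(drop=True)
        # index=False keeps to_csv from writing the integer index, which is
        # what read_csv would otherwise resurface as an "Unnamed: 0" column
        df.to_csv(path, index=False, encoding='utf-8')
        return df

    # e.g. raw_df = dedupe_and_save(raw_df, 'raw_df.csv', 'ItemID')
    # e.g. expanded_class = dedupe_and_save(expanded_class, 'expanded_class.csv', 'PictureURL')

If append mode ever comes back for the expanded_* files, the dedupe has to happen after the read-back (read, append, drop_duplicates, then overwrite without mode='a'); otherwise each run stacks the full frame on top of the old rows.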