Dropped NA values in the training df. Used reindex instead of loc to select columns.

spbeach46 2021-05-30 15:51:05 -07:00
parent c2a812f7a8
commit 9fc00f4eab


@@ -41,7 +41,7 @@ class FindingApi:
"SERVICE-VERSION":"1.13.0",
"RESPONSE-DATA-FORMAT":"JSON",
"categoryId":category_id,
"paginationInput.entriesPerPage":"10", # TODO change back to max = 100
"paginationInput.entriesPerPage":"100",
"paginationInput.PageNumber":i,
"itemFilter(0).name":"Condition",
"itemFilter(0).value":"Used"
@@ -248,7 +248,7 @@ class CurateData:
raw_data = json.load(f)
return raw_data
def raw_df(self, raw_data):
def raw_df(self, raw_data): # TODO not dropping dupes, and is appending raw_data for some reason
'''
creates pandas df from raw json and saves master raw csv file as raw_df.csv.
Intended to be used inline with direct
@@ -256,14 +256,16 @@ class CurateData:
'''
to_json = json.dumps(raw_data)
raw_df = pd.read_json(to_json)
raw_df.to_csv('raw_df.csv', mode='a')
raw_df = pd.read_csv('raw_df.csv', index_col=0)
raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # drops dupes after appending new data. (ShoppingApi call might include dupes)
raw_df.to_csv('raw_df.csv', mode='a') # TODO this might still only save the unmodified/undropped original. check to make sure
raw_df.to_csv('raw_df.csv') # NOTE not append mode because raw_df is made from the master raw_data.txt file
#raw_df = pd.read_csv('raw_df.csv', index_col=0)
#raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # may not need this
#raw_df.to_csv('raw_df.csv')
# TODO still saving "Unnamed:0" column
return raw_df
def to_training(self, raw_data): # NOTE need to create copies not views
def to_training(self, raw_data):
'''
creates first pass of potential labels for training set. This is the base
df used to produce other training sets to use.
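The TODO about only saving the unmodified frame points at a real pandas gotcha: drop_duplicates returns a new DataFrame, so calling it without reassigning (as the commented-out line does) changes nothing, and mode='a' re-appends every old row on each run. A sketch of the read-merge-dedupe-overwrite pattern those comments are circling, with save_master_raw as a hypothetical helper name:

    import pandas as pd

    def save_master_raw(raw_df, path='raw_df.csv'):
        # Merge with the existing master file if there is one.
        try:
            existing = pd.read_csv(path, index_col=0)
            raw_df = pd.concat([existing, raw_df], ignore_index=True)
        except FileNotFoundError:
            pass
        # drop_duplicates returns a copy; reassignment is what makes it stick
        raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)
        # index=False also stops the stray "Unnamed: 0" column noted above
        raw_df.to_csv(path, index=False)
        return raw_df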
@@ -271,14 +273,14 @@ class CurateData:
raw_df = self.raw_df(raw_data)
interm_df1 = raw_df.loc[:,['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
training = interm_df1
return training # TODO RENAME THIS FUNC AND RETURN VALUE
training = interm_df1.dropna(subset=['ItemSpecifics'])
return training # TODO RENAME THIS FUNC AND its RETURN VALUE
def class_training(self, training):
'''Training set for multiclass portion of training set. Used to train
separately from multilabel portion
'''
class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
return class_training
def nvl_training(self, training):
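The NOTE removed above ("need to create copies not views") refers to the usual SettingWithCopyWarning trap: a loc slice can come back as a view, and assigning into it may not propagate. A sketch of to_training doing the same steps on an explicit copy:

    def to_training(self, raw_data):
        raw_df = self.raw_df(raw_data)
        cols = ['ItemID', 'PictureURL', 'PrimaryCategoryID',
                'PrimaryCategoryName', 'Title', 'ItemSpecifics']
        interm_df1 = raw_df.loc[:, cols].copy()  # explicit copy, not a view
        interm_df1[['ItemID', 'PrimaryCategoryID']] = (
            interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str))
        return interm_df1.dropna(subset=['ItemSpecifics'])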
@@ -326,7 +328,7 @@ class CurateData:
'PictureURL', 'Style', 'Department', 'Type', 'Gender', 'Closure', 'Performance/Activity',
'Accents', 'Occasion', 'Toe Shape', 'Pattern', 'Activity',
'Heel Style', 'Fastening', 'Heel Type', 'Toe Type', 'Departement',
'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material',
'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material'
]
# May be no difference between Product type and sub style; fastening and
# closure; toe shape and toe type; occasion and performance/activity;
@@ -336,8 +338,9 @@ class CurateData:
# user created item specifics, leaving only predefined ebay item specs
user_input = input('drop or keep cols?:')
if 'keep' in user_input:
dropd = nvl_training.loc[:,col_keep]
dropd = nvl_training.reindex(columns=col_keep)
else:
dropd = nvl_training.drop(col_drop, axis=1)
return dropd
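reindex and loc behave differently on missing labels, which matters here because not every scrape returns every item specific: df.loc[:, cols] raises a KeyError if any column in cols is absent, while df.reindex(columns=cols) quietly adds it as an all-NaN column. Note that reindex takes the column list through the columns keyword (or axis=1) rather than positionally, where it would be read as a row index. A small demonstration with made-up data:

    import pandas as pd

    df = pd.DataFrame({'Style': ['sneaker'], 'Department': ['Men']})
    cols = ['Style', 'Department', 'Heel Type']  # 'Heel Type' not in df

    picked = df.reindex(columns=cols)  # 'Heel Type' comes back as all-NaN
    # df.loc[:, cols] would raise KeyError because 'Heel Type' is missing
    print(picked)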
@@ -364,7 +367,7 @@ class CurateData:
expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # probably need to create set long before df... immediately after Shopping or Trading call
# defined in the download function
#: defined in the download function
try:
@@ -380,15 +383,15 @@ class CurateData:
# Append to master training dataframes, drop potential dupes and save
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')
expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
expanded_class.to_csv('expanded_class.csv')
# expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
# expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
expanded_dropd.to_csv('expanded_dropd.csv')
# expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
# expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
return expanded_class, expanded_dropd
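Since the same read-dedupe-write dance appears twice here (and once more in raw_df), it could live in one place. A sketch under the same assumptions as above, with _dedupe_and_save as a hypothetical helper name:

    def _dedupe_and_save(df, path):
        # Keep one row per PictureURL and overwrite the CSV in place.
        df = df.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        df.to_csv(path, index=False, encoding='utf-8')
        return df

    expanded_class = _dedupe_and_save(expanded_class, 'expanded_class.csv')
    expanded_dropd = _dedupe_and_save(expanded_dropd, 'expanded_dropd.csv')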