Dropped NA values from the training df. Used reindex instead of loc to select columns
This commit is contained in:
parent
c2a812f7a8
commit
9fc00f4eab
45
ebay_api.py
45
ebay_api.py
@ -41,7 +41,7 @@ class FindingApi:
|
||||
"SERVICE-VERSION":"1.13.0",
|
||||
"RESPONSE-DATA-FORMAT":"JSON",
|
||||
"categoryId":category_id,
|
||||
"paginationInput.entriesPerPage":"10", # TODO change back to max = 100
|
||||
"paginationInput.entriesPerPage":"100",
|
||||
"paginationInput.PageNumber":i,
|
||||
"itemFilter(0).name":"Condition",
|
||||
"itemFilter(0).value":"Used"
|
||||
@ -248,7 +248,7 @@ class CurateData:
|
||||
raw_data = json.load(f)
|
||||
return raw_data
|
||||
|
||||
def raw_df(self, raw_data):
|
||||
def raw_df(self, raw_data): # TODO not dropping dupes, and is appending raw_data for some reason
|
||||
'''
|
||||
creates pandas df from raw json and saves master raw csv file as raw_df.csv.
|
||||
Intended to be used inline with direct
|
||||
@ -256,14 +256,16 @@ class CurateData:
|
||||
'''
|
||||
to_json = json.dumps(raw_data)
|
||||
raw_df = pd.read_json(to_json)
|
||||
raw_df.to_csv('raw_df.csv', mode='a')
|
||||
raw_df = pd.read_csv('raw_df.csv', index_col=0)
|
||||
raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # drops dupes after appending new data. (ShoppingApi call might include dupes)
|
||||
raw_df.to_csv('raw_df.csv', mode='a') # TODO this might still only save the unmodified/undropped original. check to make sure
|
||||
raw_df.to_csv('raw_df.csv') # NOTE not append mode because raw_df is made from the master raw_data.txt file
|
||||
#raw_df = pd.read_csv('raw_df.csv', index_col=0)
|
||||
#raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # may not need this
|
||||
#raw_df.to_csv('raw_df.csv')
|
||||
|
||||
# TODO still saving "Unnamed:0" column
|
||||
|
||||
return raw_df
|
||||
|
||||
def to_training(self, raw_data): # NOTE need to create copies not views
|
||||
def to_training(self, raw_data):
|
||||
'''
|
||||
creates first pass of potential labels for training set. This is the base
|
||||
df used to produce other training sets to use.
|
||||
@ -271,14 +273,14 @@ class CurateData:
|
||||
raw_df = self.raw_df(raw_data)
|
||||
interm_df1 = raw_df.loc[:,['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
|
||||
interm_df1[['ItemID', 'PrimaryCAegoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
|
||||
training = interm_df1
|
||||
return training # TODO RENAME THIS FUNC AND RETURN VALUE
|
||||
training = interm_df1.dropna(subset=['ItemSpecifics'])
|
||||
return training # TODO RENAME THIS FUNC AND its RETURN VALUE
|
||||
|
||||
def class_training(self, training):
|
||||
'''Training set for multiclass portion of training set. Used to train
|
||||
separately from multilabel portion
|
||||
'''
|
||||
class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
|
||||
class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
|
||||
return class_training
|
||||
|
||||
def nvl_training(self, training):
|
||||
@ -326,7 +328,7 @@ class CurateData:
|
||||
'PictureURL', 'Style', 'Department', 'Type', 'Gender', 'Closure', 'Performance/Activity',
|
||||
'Accents', 'Occasion', 'Toe Shape', 'Pattern', 'Activity',
|
||||
'Heel Style', 'Fastening', 'Heel Type', 'Toe Type', 'Departement',
|
||||
'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material',
|
||||
'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material'
|
||||
]
|
||||
# May be no difference between Product type and sub style; fastening and
|
||||
# closure; toe shape and toe type; occasion and performance/activity;
|
||||
@ -336,8 +338,9 @@ class CurateData:
|
||||
# user created item specifics, leaving only predefined ebay item specs
|
||||
|
||||
user_input = input('drop or keep cols?:')
|
||||
|
||||
if 'keep' in user_input:
|
||||
dropd = nvl_training.loc[:,col_keep]
|
||||
dropd = nvl_training.reindex([col_keep])
|
||||
else:
|
||||
dropd = nvl_training.drop(col_drop, axis=1)
|
||||
return dropd
|
||||
@ -364,7 +367,7 @@ class CurateData:
|
||||
expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
|
||||
|
||||
temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
|
||||
# defined in the download function
|
||||
#: defined in the download function
|
||||
|
||||
|
||||
try:
|
||||
@ -380,15 +383,15 @@ class CurateData:
|
||||
|
||||
# Append to master training dataframes, drop potential dupes and save
|
||||
|
||||
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')
|
||||
expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
|
||||
expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
|
||||
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
|
||||
expanded_class.to_csv('expanded_class.csv')
|
||||
# expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
|
||||
# expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
|
||||
# expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
|
||||
|
||||
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
|
||||
expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
|
||||
expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
|
||||
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
|
||||
expanded_dropd.to_csv('expanded_dropd.csv')
|
||||
# expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
|
||||
# expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
|
||||
# expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
|
||||
|
||||
return expanded_class, expanded_dropd
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user