added nvl_dict function to create a series based on the NameValueList (nvl) dict of the raw data
This commit is contained in:
parent d09090dc13
commit 18871a76d7

ebay_api.py · 39 changed lines
@@ -127,6 +127,30 @@ class CurateData:
    value pairs that get updated to the custom dataframe used in ML training sets.
    '''

    def import_raw(self):
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
            return raw_data

    def data_frame(self, data):
        # round-trip through json to flatten the raw response into a DataFrame
        to_json = json.dumps(data)
        raw_df = pd.read_json(to_json)
        return raw_df

    def to_training(self, data):
        raw_df = self.data_frame(data)
        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
        training = interm_df1

        return training

    def nvl_dict(self, training):
        interm_df1 = pd.Series(training.ItemSpecifics)
        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
        nvl_dict = interm_df1.apply(lambda x: {n['Name']: n['Value'] for n in x})
        return nvl_dict

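For illustration, a minimal usage sketch of the new nvl_dict against a made-up listing; the field values below are assumptions for illustration, not real eBay API output:

from ebay_api import CurateData

# Made-up listing shaped like the raw data this module loads
data = [{'ItemID': 1234,
         'PictureURL': ['http://example.com/a.jpg'],
         'PrimaryCategoryID': 11450,
         'PrimaryCategoryName': 'Clothing',
         'Title': 'Running Shoes',
         'ItemSpecifics': {'NameValueList': [
             {'Name': 'Brand', 'Value': 'Nike'},
             {'Name': 'Color', 'Value': 'Blue'}]}}]

curate = CurateData()
training = curate.to_training(data)
print(curate.nvl_dict(training)[0])   # {'Brand': 'Nike', 'Color': 'Blue'}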
    def update_df(self, data):
        '''
        Creates training instances for the dataset. picture_url_list expanded to
@@ -135,17 +159,11 @@ class CurateData:
        per listing, each picture will be its own training instance.
        '''

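One way the expansion described in this docstring could be sketched, assuming training is the frame returned by to_training and pandas >= 0.25 is available (an illustration, not the committed implementation):

# Give each url in a PictureURL list its own row, so every picture
# becomes its own training instance carrying the listing's other fields.
expanded = training.explode('PictureURL').reset_index(drop=True)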
    def data_frame(self, data):
        to_json = json.dumps(data)
        raw_df = pd.read_json(to_json)
        return raw_df

    def to_training(self, data):
        raw_df = self.data_frame(data)
        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
        # USE a combination of apply() and a dict comprehension to extract the custom nvl_dict from the nvl in each cell
        # USE training.apply(func, axis=...) to create the custom nvl_dict for each cell
        # USE raw_df.loc[:, ['col1', 'col2', 'col3', 'etc']] for creating a new df. There may be another way though.

        # USE pd.merge() at some point...possibly after expanding lists and nvl
        # USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can transform each
        # new raw_df first before combining it with the previous transformed df. Then you can take the raw_df and combine it
        # with the old raw_df for backup. See the sketch below.

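A rough sketch of the pd.concat idea from the notes above; batch_1, batch_2, and the csv name are placeholders, not part of the module:

import pandas as pd
from ebay_api import CurateData

curate = CurateData()
raw_batches = [batch_1, batch_2]   # placeholder: successive raw API results

# Transform each new raw batch first, then stack the transformed frames
# into one larger csv; keep the raw frames separately as a backup.
transformed = [curate.to_training(batch) for batch in raw_batches]
combined = pd.concat(transformed, sort=False)
combined.to_csv('training_combined.csv', index=False)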
@@ -170,6 +188,9 @@ if __name__ == "__main__":
    # to divide these up into the categories. This will leave you with about 6.25K results per cat.
    # More than enough data for your dataset.

    # For future reference: to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields),
    # you can drop either listings or k/v pairs that are unique. These can be found by applying a function to determine the
    # frequency of k/v pairs --> list of unique k/v pairs --> frequency of each unique k/v pair --> drop those that occur
    # only once (see the sketch below).

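One possible shape for that frequency idea, assuming nvl is the Series of dicts returned by nvl_dict (illustrative only):

from collections import Counter

# Count how often each (name, value) pair occurs across all listings,
# then drop the pairs seen only once (likely seller-typed custom values).
pair_counts = Counter(pair for d in nvl for pair in d.items())
nvl_cleaned = nvl.apply(
    lambda d: {k: v for k, v in d.items() if pair_counts[(k, v)] > 1})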
    # TODO NEED TO ADD TRY/EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
    # TO AVOID HICCUPS WHEN CREATING DATASET (sketch below)
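One possible shape for that TODO; api.execute() stands in for whatever ebaysdk call is being made, and the retry policy is a guess:

import time

def safe_call(api, verb, params, retries=3):
    # Wrap an API call so a transient failure retries with backoff
    # instead of crashing the dataset build.
    for attempt in range(retries):
        try:
            return api.execute(verb, params)
        except Exception as e:
            print('API call failed (attempt {}): {}'.format(attempt + 1, e))
            time.sleep(2 ** attempt)   # simple exponential backoff
    return None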
    # TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS, EITHER USING YOUR OWN TAGS
    # OR SOMEHOW FINDING TAGS ON OTHERS' LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as
    # adding a def to one of the apis to extract only the picture if it can identify what a tag looks like.
    # So, it may actually be a good thing to include all the pictures in a training set, but then when you're
    # ready to begin training you'll have a data cleaning pipeline specific to training a model to either
    # learn shoe features or information on tags.