added nvl_dict function to create a Series based on the NameValueList (nvl) dict of raw data

This commit is contained in:
spbeach46 2021-01-26 23:01:00 -07:00
parent d09090dc13
commit 18871a76d7


@ -127,6 +127,30 @@ class CurateData:
value pairs that get updated to the custom dataframe used in ML training sets.
'''
def import_raw(self):
with open('raw_data.txt') as f:
raw_data = json.load(f)
return raw_data
def data_frame(self, data):
to_json = json.dumps(data)
raw_df = pd.read_json(to_json)
return raw_df
def to_training(self, data):
raw_df = self.data_frame(data)
interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
training = interm_df1
return training
def nvl_dict(self, training):
# extract the NameValueList from each listing's ItemSpecifics and flatten it into a single {Name: Value} dict per listing
interm_df1 = pd.Series(training.ItemSpecifics)
interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
nvl_dict = interm_df1.apply(lambda x: {n['Name']: n['Value'] for n in x})
return nvl_dict
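For reference, a minimal sketch of how the new nvl_dict method chains with the other methods above (hypothetical driver code, not part of this commit; assumes raw_data.txt exists, CurateData takes no constructor arguments, and each ItemSpecifics entry contains a NameValueList):

# hypothetical usage; assumes raw_data.txt was saved by an earlier scraping run
curate = CurateData()
raw_data = curate.import_raw()            # cached JSON response loaded from raw_data.txt
training = curate.to_training(raw_data)   # dataframe with the columns used for ML training sets
nvl = curate.nvl_dict(training)           # Series of {Name: Value} dicts, one per listing
print(nvl.iloc[0])                        # e.g. {'Brand': '...', 'US Shoe Size': '...'}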
def update_df(self, data):
'''
Creates training instances for dataset. picture_url_list expanded to
@ -135,17 +159,11 @@ class CurateData:
per listing, each picture will be its own training instance.
'''
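The docstring above describes expanding the picture URL list so that each picture becomes its own training instance; one way to do that (a sketch, not necessarily the committed implementation) is pandas' explode:

# one row per picture: every entry of a listing's PictureURL list becomes its own training instance
# (assumes 'training' is the dataframe returned by to_training)
expanded = training.explode('PictureURL').reset_index(drop=True)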
def data_frame(self, data):
to_json = json.dumps(data)
raw_df = pd.read_json(to_json)
return raw_df
def to_training(self):
raw_df = self.data_frame(data)
interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
interm_df1[['ItemID', 'PrimaryCAegoryID']] = interm_df[['ItemID', 'PrimaryCategoryID']].astype(str)
# USE a combination of apply() and dict comprehension to extract your custom nvl_dict from the nvl in each cell
# USE training.apply(func, axis=something) to create your custom nvl_dict for each cell
# USE raw_df.loc[:, ['col1', 'col2', 'col3', 'etc']] for creating a new df. There may be another way though.
# USE pd.merge() at some point...possibly after expanding lists and nvl
# USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can transform each new raw_df first before combining it with the previous transformed
# df. Then you can take the raw_df and combine it with the old raw_df for backup.
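A rough sketch of the concat/backup idea from the notes above (the df names are placeholders, not variables that exist in this module):

# old_*/new_* are placeholder names for the accumulated and freshly transformed frames
combined_training = pd.concat([old_training_df, new_training_df], sort=False)
# keep the untransformed responses as a growing backup as well
combined_raw = pd.concat([old_raw_df, new_raw_df], sort=False)
combined_raw.to_csv('raw_backup.csv', index=False)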
@ -170,6 +188,9 @@ if __name__ == "__main__":
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.
# for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique, which
# can be determined by applying a function to find the frequency of k/v pairs --> list of unique k/v pairs --> function to determine frequency of unique k/v pairs --> drop those with a frequency of 1.
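A sketch of the frequency idea from the note above, simplified to the Name level and assuming nvl_dict has already produced a Series of {Name: Value} dicts (variable names are illustrative):

from collections import Counter

# nvl is assumed to be the Series returned by nvl_dict(training)
# count how often each item-specifics Name appears across all listings
name_counts = Counter(name for d in nvl for name in d)
# drop names that only ever appear once; those are most likely seller-entered custom fields
cleaned = nvl.apply(lambda d: {k: v for k, v in d.items() if name_counts[k] > 1})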
# TODO NEED TO ADD TRY/EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
# TO AVOID HICCUPS WHEN CREATING DATASET
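One possible shape for the try/except TODO above (safe_call is a placeholder helper, not an existing function in this repo):

import time

def safe_call(func, *args, retries=3, delay=5, **kwargs):
    # func is whatever API call you want to protect (placeholder)
    # generic wrapper so every API call gets the same basic error handling
    for attempt in range(1, retries + 1):
        try:
            return func(*args, **kwargs)
        except Exception as err:
            print(f"API call failed ({err}); retry {attempt} of {retries}")
            time.sleep(delay)
    return None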
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS, EITHER USING YOUR OWN TAGS OR BY FINDING TAGS ON OTHERS' LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS.
# It may be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the
# pictures in a training set, but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or
# information on tags.