dropping cols from nvl_df and combining class_training with nvl_training

This commit is contained in:
spbeach46 2021-02-07 12:25:37 -07:00
parent 1087027812
commit 855e65af80

View File

@ -160,7 +160,7 @@ class CurateData:
return training
def class_training(self, training):
    """Return the columns used to train the category classifier.

    Args:
        training: DataFrame containing at least the ``PictureURL`` and
            ``PrimaryCategoryID`` columns.

    Returns:
        DataFrame holding only ``PictureURL`` and ``PrimaryCategoryID``,
        in that order, for every row of ``training``.

    Raises:
        KeyError: If either column is missing from ``training``.
    """
    # An earlier ``class_df`` assignment selected these same columns and was
    # never read, so only the frame that is actually returned is built.
    return training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
def nvl_training(self, training):
@ -170,6 +170,46 @@ class CurateData:
nvl_df = pd.json_normalize(nvl_dict)
nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)
return nvl_training
def drop_nvl_cols(self, nvl_training, *, mode=None):
    """Reduce the item-specifics frame to the columns wanted for training.

    Two strategies are supported: keep only an allow-list of columns, or
    drop a deny-list and keep everything else. Columns named in either list
    that are not present in ``nvl_training`` are skipped rather than raising,
    because the set of columns varies with the listings that were normalized.

    Args:
        nvl_training: DataFrame with a ``PictureURL`` column followed by
            the normalized item-specifics columns.
        mode: Optional strategy selector. Any string containing ``'keep'``
            selects the allow-list; any other string selects the deny-list.
            When ``None`` (the default) the user is prompted on stdin, which
            is the original interactive behaviour.

    Returns:
        DataFrame containing only the retained columns.
    """
    col_drop = [
        'Fabric Type', 'Type of Sport', 'Mid Sole', 'Modified Item',
        'Modification Description', 'Article Type', 'Customized',
        'Character', 'Features', 'Colors', 'Shade', 'Product ID',
        'Personalized', 'Platform Height', 'Year Manufactured',
        'Trim Material', 'Fashion Element', 'Shaft Material',
        'Character Family', 'Heel to Toe Drop', 'Custom Bundle',
        'California Prop 65 Warning', 'Manufacturer Color', 'Main Color',
        'Collection', 'Midsole Type', 'Signed', 'US Shoe Size (Men#!#s)',
        'Calf Circumference', 'Handmade', 'Safety Standards',
        'Customised', 'Cleat Type', 'Cushioning Level', 'AU Shoe Size',
        'Country/Region of Manufacture', 'Main Colour',
        'Look', 'Sole Type', 'Sole Manufacturer Colour', 'Sole Material',
        'Toe Material', 'Feature', 'Length', 'Width', 'Size Chart',
        'Boot Height', 'Water Resistance Level', 'Material Composition',
        'Calf Width', 'Insole Material', 'UPC', 'Size Type',
    ]
    # The URL column is created from ``training.PictureURL`` and is therefore
    # named 'PictureURL' (no space); 'Picture URL' never matched a column.
    col_keep = [
        'PictureURL', 'Style', 'Department', 'Type', 'Gender', 'Closure',
        'Performance/Activity', 'Accents', 'Occasion', 'Toe Shape',
        'Pattern', 'Activity', 'Heel Style', 'Fastening', 'Heel Type',
        'Toe Type', 'Departement', 'Product Type', 'Sub Style', 'Season',
        'Theme', 'Upper Material',
    ]
    # May be no difference between Product type and sub style; fastening and
    # closure; toe shape and toe type; occasion and performance/activity;
    # see if you can combine these somehow (you may not want this though).
    # Also consider keeping only cols that have plenty of values

    if mode is None:
        mode = input('drop or keep cols?:')

    if 'keep' in mode:
        # Select with the flat list (not a list wrapped in a list) and only
        # the labels that exist, preserving the allow-list order.
        present = [col for col in col_keep if col in nvl_training.columns]
        dropd_nvl_training = nvl_training.loc[:, present]
    else:
        # errors='ignore' tolerates deny-list entries absent from this frame.
        dropd_nvl_training = nvl_training.drop(columns=col_drop, errors='ignore')
    return dropd_nvl_training
def combine_nvlclass(self, class_training=None, dropd_nvl_training=None):
    """Join the classifier columns and the item-specifics columns side by side.

    The original body read two names that were never defined in this scope
    (one of them misspelled), so every call raised ``NameError``. The frames
    are now passed in explicitly.

    Args:
        class_training: DataFrame of classifier columns, e.g. the result of
            ``class_training()``.
        dropd_nvl_training: DataFrame of retained item-specifics columns,
            e.g. the result of ``drop_nvl_cols()``.

    Returns:
        DataFrame with the columns of ``class_training`` followed by the
        columns of ``dropd_nvl_training``, aligned on the row index.

    Raises:
        ValueError: If either frame is not supplied.
    """
    if class_training is None or dropd_nvl_training is None:
        raise ValueError(
            'combine_nvlclass requires both class_training and '
            'dropd_nvl_training frames'
        )
    # axis=1 concatenates column-wise; rows are matched by index label.
    final_training = pd.concat([class_training, dropd_nvl_training], axis=1)
    return final_training
# TODO Still need to extract strings from the lists of strings and then drop the ones you don't want, or vice versa. You may have to avoid using cells with lists of strings longer than one (e.g., 'Features').
# TODO Also need to expand the photo list from PictureURL. Decide how many or which photos to use. You may even want to use a pretrained model to decide whether or not the photos are of shoes, and filter on that;
# it might be that only the first picture is reliable enough to use in the dataset.
@ -223,3 +263,23 @@ if __name__ == "__main__":
# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
# TO AVOID HICCUPS WHEN CREATING DATASET
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
'''
List of columns from nvl_list that I want to drop before training:
['Fabric Type', 'Type of Sport', 'Mid Sole', 'Modified Item', 'Modification Description', 'Article Type', 'Customized', 'Character', 'Features', 'Colors', 'Shade', 'Product ID', 'Personalized', 'Platform Height',
'Year Manufactured', 'Trim Material', 'Fashion Element', 'Shaft Material', 'Character Family', 'Heel to Toe Drop', 'Custom Bundle', 'California Prop 65 Warning', 'Manufacture Color', 'Main Color',
'Collection', 'Mid Sole Type', 'Signed', 'US Shoe Size (Men#!#s)', 'Calf Circumference', 'Hand Made', 'Safety Standards', 'Customised', 'Cleat Type', 'Cushioning Level', 'AU Shoe Size', 'Country/Region of Manufacture',
'Type of Sport', 'Main Colour', 'Look']
'''
'''
list of columns from nvl_list that I want to keep before training:
[
'Picture URL', 'Style', 'Department', 'Type', 'Gender', 'Closure', 'Performance/Activity',
'Accents', 'Occasion', 'Toe Shape', 'Pattern', 'Activity',
'Heel Style', 'Fastening', 'Heel Type', 'Toe Type', 'Closure Type', 'Departement',
'Product Type', 'Sub Style', 'Season', 'Theme', 'Material', 'Upper Material',
]
'''
# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)