34 lines
1.9 KiB
Python
34 lines
1.9 KiB
Python
import ebay_api
|
|
import numpy as np
|
|
|
|
'''
Script that chains the CurateData methods from ebay_api.py to curate the training data.
'''
|
|
|
|
# Curation pipeline: each step consumes the dataframe produced by an earlier one,
# so the statement order matters.
curate = ebay_api.CurateData()
raw_data = curate.import_raw()

# NOTE have to reference the PictureURL list here if you want to expand. The
# other column is a string in subsequent dfs (or use dropd.PictureURL.split(' ')).
training = curate.to_training(raw_data)

class_training = curate.class_training(training)
nvl_training = curate.nvl_training(training)

# NOTE move this method above extracted and this should solve the expand
# before extract problem.
dropd = curate.drop_nvl_cols(nvl_training)

# Remaining steps, not enabled yet. expand_nvlclass is defined further down in
# this file, so this call has to run after that definition.
# expand_nvlclass(class_training, dropd)

# Only extract contents after running expand_nvlclass and returning the expanded dropd.
# extracted_df = curate.extract_contents(expanded_dropd)
|
|
|
|
def expand_nvlclass(class_training, dropd):
    '''
    Expand the PictureURL list of every row into one row per image url.

    Both dataframes are exploded on their 'PictureURL' column: a row holding
    n urls becomes n rows that repeat the values of the other columns, and the
    index of each result is renumbered from 0. New dataframes are returned;
    the dataframes passed in are not modified.

    Parameters:
        class_training: dataframe with a list-like 'PictureURL' column.
        dropd: dataframe with a list-like 'PictureURL' column.

    Returns:
        tuple (expanded_class_training, expanded_dropd).
    '''
    expanded_class_training = class_training.explode('PictureURL').reset_index(drop=True)
    expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)

    # NOTE the picture list needs to be preserved prior to creating dropd and
    # extracted; maybe run extracted_df after dropd or after running nvl_training.

    # TODO still need to replace source url with destination url in the df cols
    # and create the custom dict {<source>: <destination>}.
    return expanded_class_training, expanded_dropd
|