import ebay_api
import numpy as np
'''
File used to compile methods from ebay_api.py for curating training data.
'''
curate = ebay_api.CurateData()
raw_data = curate.import_raw()
training = curate.to_training(raw_data)  # NOTE the PictureURL list has to be referenced here if you want to expand; the column is a string in subsequent dfs
# or use dropd.PictureURL.split(' ')
class_training = curate.class_training(training)
nvl_training = curate.nvl_training(training)
dropd = curate.drop_nvl_cols(nvl_training)  # NOTE move this method above extracted; that should solve the expand-before-extract problem
# expand_nvlclass(class_training, dropd)
# extracted_df = curate.extract_contents(expanded_dropd)  # only extract contents after running expand_nvlclass and returning the expanded dropd
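# Intended end-to-end order once the ordering note above is resolved (a sketch;
# extract_contents is assumed to accept the exploded dropd df):
# expanded_class_training, expanded_dropd = expand_nvlclass(class_training, dropd)
# extracted_df = curate.extract_contents(expanded_dropd)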
def expand_nvlclass(class_training, dropd):
    '''
    Takes the image URL list from each cell and expands it into separate/duplicate
    instances. Modifies both the class_training and dropd dfs. Appends a custom
    image URL dict {'source': 'destination'}.
    '''
    # interm_s = class_training.PictureURL.apply(lambda x: len(x))
    # expanded_class_training = class_training.loc[np.repeat(class_training.index.values, interm_s)].reset_index(drop=True)
    expanded_class_training = class_training.explode('PictureURL').reset_index(drop=True)
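    # NOTE explode() emits one row per element of each PictureURL list and
    # duplicates the remaining columns, superseding the np.repeat version above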
    # expanded_class_training.PictureURL.apply(lambda x: 'c:/users/unknown/
    expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
    # expanded_dropd = dropd.loc[np.repeat(dropd.index.values, interm_s)].reset_index(drop=True)  # TODO CHANGE this to use explode(); the picture list needs preservation
    # prior to creating dropd and extracted. Maybe run extracted_df after dropd or after running nvl_training
    # interm_s = interm_s.astype(str).applymap(lambda x: x.split(',')*4)
    return expanded_class_training, expanded_dropd  # TODO still need to replace source url with destination url in df cols and create the custom dict {<source>: <destination>}
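
# A minimal sketch of the TODO above, not the final implementation: build the
# custom {'source': 'destination'} dict from an exploded df and swap the column
# values over to local paths. The helper name, the dest_root default, and the
# basename naming scheme are all assumptions for illustration.
def make_url_dict(expanded_df, dest_root='images/'):
    '''
    Maps each source PictureURL to a destination path under dest_root and
    replaces the column values with those paths. Returns (modified df, url dict).
    '''
    # one destination path per unique source url, keyed for downloading later
    url_dict = {url: dest_root + url.split('/')[-1]
                for url in expanded_df.PictureURL.dropna().unique()}
    # swap source urls for their destination paths in the df column
    expanded_df.PictureURL = expanded_df.PictureURL.map(url_dict)
    return expanded_df, url_dict
# e.g. expanded_dropd, url_dict = make_url_dict(expanded_dropd)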