fixed extract_contents for expanded_dropd df

spbeach46 2021-04-03 13:09:21 -07:00
parent 167f1f29ec
commit 35100b7952
2 changed files with 26 additions and 31 deletions

View File

@@ -7,27 +7,12 @@ file used to compile methods from ebay_api.py for curating training data
curate = ebay_api.CurateData()
raw_data = curate.import_raw()
-training = curate.to_training(raw_data) # NOTE have to reference PictureURL list here if you want to expand. Other column is string in subsequent dfs
-# or use dropd.PictureURL.split(' ')
+training = curate.to_training(raw_data)
class_training = curate.class_training(training)
nvl_training = curate.nvl_training(training)
-dropd = curate.drop_nvl_cols(nvl_training) # NOTE move this method above extracted and this should solve the expand before extract problem
+dropd = curate.drop_nvl_cols(nvl_training)
# expand_nvlclass(class_training, dropd)
-# extracted_df = curate.extract_contents(expended_dropd) # only extract contents after running expand_nvlclass and returning expanded dropd
+# extracted_df = curate.extract_contents(expanded_dropd) # only extract contents after running expand_nvlclass and returning expanded dropd
-def expand_nvlclass(class_training, dropd):
-    '''
-    takes image url list from each cell and expands them into separate/duplicate
-    instances. Modifies both class training and dropd dfs. Appends custom
-    image url dict {'source':'destination'}.
-    '''
-    #interm_s =class_training.PictureURL.apply(lambda x: len(x))
-    #expanded_class_training = class_training.loc[np.repeat(class_training.index.values, interm_s)].reset_index(drop=True)
-    expanded_class_training = class_training.explode('PictureURL').reset_index(drop=True)
-    # expanded_class_training.PictureURL.apply(lambda x: 'c:/users/unknown/
-    expanded_dropd = dropd.explode('PictureURL').reset_indext(drop=True)
-    #expanded_dropd = dropd.loc[np.repeat(dropd.index.values, interm_s)].reset_index(drop=True) # TODO CHANGE this to use explode(). picture list needs preservation
-    # prior to creating dropd and extracted. maybe run extraced_df after dropd or after running nvl_training
-    #interm_s = interm_s.astype(str).applymap(lambda x: x.split(',')*4)
-    return expanded_class_training, expanded_dropd # TODO still need to replace source url to destination url in df cols and create custom dict {<source>, <distination>}
-    # TODO # need to replace expanded df's PictureURL col values with destination urls
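For reference, the explode() call that replaced the commented-out np.repeat approach works like this; a minimal sketch with made-up data:

import pandas as pd

df = pd.DataFrame({
    'ItemID': [1, 2],
    'PictureURL': [['url_a', 'url_b'], ['url_c']],  # one list of picture urls per listing
})

# explode() emits one row per list element, duplicating the other columns;
# reset_index(drop=True) renumbers the expanded rows 0..n-1.
expanded = df.explode('PictureURL').reset_index(drop=True)
#    ItemID PictureURL
# 0       1      url_a
# 1       1      url_b
# 2       2      url_c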

View File

@@ -201,14 +201,15 @@ class CurateData:
        nvl_dict = interm_df1.apply(lambda x: {k:v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])})
        nvl_df = pd.json_normalize(nvl_dict)
        nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)
        # TODO MAY HAVE TO RUN drop_nvl_cols and extract_contents in here
        return nvl_training

    def extract_contents(self, df):
        '''
        converts single-value lists of strings of any df to string if not null
        '''
-        extracted_df = df.applymap(lambda x: ' '.join(x) if np.any(pd.notnull(x)) else np.nan)
+        extracted_df = df.applymap(lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isnull(x) else x)
        return extracted_df

    def drop_nvl_cols(self, nvl_training):
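A minimal sketch of what the new extract_contents() lambda does, with made-up values: lists are joined into one space-separated string, nulls stay NaN, and anything else passes through unchanged:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'Brand': [['Nike'], np.nan],
    'Style': [['Running', 'Trail'], 'Casual'],  # multi-item list and plain string
})

extracted = df.applymap(
    lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isnull(x) else x
)
#   Brand          Style
# 0  Nike  Running Trail
# 1   NaN         Casual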
@@ -239,6 +240,8 @@ class CurateData:
        # closure; toe shape and toe type; occasion and performance/activity;
        # see if you can combine these somehow (you may not want this though).
        # Also consider keeping only cols that have plenty of values
+        # Run some value_count() analysis to determine frequencies and filter
+        # user created item specifics, leaving only predefined ebay item specs
        user_input = input('drop or keep cols?:')
        if 'keep' in user_input:
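One way to act on the new value_count() note; a hedged sketch, where the function name and threshold are assumptions rather than anything in this repo:

import pandas as pd

def filter_sparse_cols(nvl_training: pd.DataFrame, min_count: int = 50) -> pd.DataFrame:
    # Count how many listings actually fill in each item-specific column,
    # then keep only the columns used at least min_count times.
    fill_counts = nvl_training.notnull().sum()
    keep = fill_counts[fill_counts >= min_count].index
    return nvl_training[keep]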
@@ -249,7 +252,12 @@ class CurateData:
    def combine_nvlclass(self, class_training, dropd):
        final_training = pd.concat([class_training, dropd], axis=1)
-        return final_training # TODO might not need this
+        return final_training # TODO might not need this function
+
+    def make_dict_pics(self, expanded_class_training):
+        with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up
+            dict_pics = json.load(jf)
+            # dict_pics.extend('<

    def expand_nvlclass(class_training, dropd):
        '''
@@ -257,17 +265,14 @@
        instances. Modifies both class training and dropd dfs. Appends custom
        image url dict {'source':'destination'}.
        '''
-        #interm_s =class_training.PictureURL.apply(lambda x: len(x))
-        #expanded_class_training = class_training.loc[np.repeat(class_training.index.values, interm_s)].reset_index(drop=True)
        expanded_class_training = class_training.explode('PictureURL').reset_index(drop=True)
-        expanded_dropd = dropd.explode('PictureURL').reset_indext(drop=True)
-        #expanded_dropd = dropd.loc[np.repeat(dropd.index.values, interm_s)].reset_index(drop=True) # TODO CHANGE this to use explode(). picture list needs preservation
-        # prior to creating dropd and extracted. maybe run extraced_df after dropd or after running nvl_training
-        #interm_s = interm_s.astype(str).applymap(lambda x: x.split(',')*4)
-        return expanded_class_training, expanded_dropd
+        # expanded_class_training.PictureURL.apply(lambda x: 'c:/users/unknown/
+        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
+        # expanded_dropd.PictureURL.apply(lambda x: 'c:/users/unknown/
+        return expanded_class_training, expanded_dropd, dict_pics # TODO still need to replace source url to destination url in df cols and create custom dict {<source>, <destination>}

-    def dl_pictures(self, dict_pic, expand=1):
+    def dl_pictures(self):
        '''
        Downloads pictures from api to local storage using custom master dict
        '''
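make_dict_pics() above is still a stub; a hedged sketch of the {'source': 'destination'} master dict the docstring and TODOs describe, where the dict_pic.txt name comes from the stub and the local naming scheme is purely an assumption:

import json
import os

def make_dict_pics(expanded_class_training, dest_dir='local_images'):
    # Map each source url to an assumed local destination path.
    dict_pics = {
        url: os.path.join(dest_dir, os.path.basename(url))
        for url in expanded_class_training.PictureURL.dropna().unique()
    }
    with open('dict_pic.txt', 'w') as jf:  # filename taken from the stub above
        json.dump(dict_pics, jf)
    return dict_pics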
@@ -280,8 +285,13 @@
        filename = '<your destination + naming schem.jpg>'
        with open(filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
+        # NOTE consider adding this dl_pictures func inside another func that uses
+        # threading to run the dl_pictures func here somewhere
        # PictureURL in PictureURL list can't be downloaded....have to use indirect address in the form https://i.ebayimg.com/images/g/<unique code>/s-l<size>.jpg
        # in place of https://i.ebayimg.com/00/s/ODQwWDE2MDA=/z/<unique code>/$_1.JPG or use requests methods instead of wget and original PictureURL? yes, use requests
+        # TODO pipeline gameplan: 5 files: master img download dict, raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
+        # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
+        # if not exists and append to master img download dict
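Tying the notes above together: a hedged sketch of a requests-based downloader fanned out over threads, as the NOTE suggests; dict_pics (source url -> destination path) and every name here are assumptions, not code from this repo:

import shutil
from concurrent.futures import ThreadPoolExecutor

import requests

def download_pic(source_dest):
    source, destination = source_dest
    r = requests.get(source, stream=True)  # stream so the image isn't held whole in memory
    r.raw.decode_content = True  # let urllib3 undo gzip/deflate if the server applied it
    with open(destination, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

def dl_pictures_threaded(dict_pics, max_workers=8):
    # Downloads are I/O-bound, so a small thread pool speeds them up.
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        ex.map(download_pic, dict_pics.items())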