diff --git a/curate.py b/curate.py
index 5cff803..655dbd4 100644
--- a/curate.py
+++ b/curate.py
@@ -11,8 +11,9 @@ training = curate.to_training(raw_data) # NOTE have to reference PictureURL list
 # or use dropd.PictureURL.split(' ')
 class_training = curate.class_training(training)
 nvl_training = curate.nvl_training(training)
-extracted_df = curate.extract_contents(nvl_training)
-dropd = curate.drop_nvl_cols(extracted_df)
+dropd = curate.drop_nvl_cols(nvl_training) # NOTE moving this call above extract_contents should solve the expand-before-extract problem
+# expand_nvlclass(class_training, dropd)
+# extracted_df = curate.extract_contents(expanded_dropd) # only extract contents after running expand_nvlclass and returning the expanded dropd

 def expand_nvlclass(class_training, dropd):
     '''
@@ -23,7 +24,10 @@ def expand_nvlclass(class_training, dropd):
     #interm_s = class_training.PictureURL.apply(lambda x: len(x))
     #expanded_class_training = class_training.loc[np.repeat(class_training.index.values, interm_s)].reset_index(drop=True)
     expanded_class_training = class_training.explode('PictureURL').reset_index(drop=True)
-    expanded_dropd = dropd.loc[np.repeat(dropd.index.values, interm_s)].reset_index(drop=True) # TODO CHANGE this to use explode(). picture list needs preservation
+    # expanded_class_training.PictureURL.apply(lambda x: 'c:/users/unknown/
+    expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
+    #expanded_dropd = dropd.loc[np.repeat(dropd.index.values, interm_s)].reset_index(drop=True) # TODO CHANGE this to use explode(). picture list needs preservation
 # prior to creating dropd and extracted. maybe run extracted_df after dropd or after running nvl_training
     #interm_s = interm_s.astype(str).applymap(lambda x: x.split(',')*4)
+    return expanded_class_training, expanded_dropd

 # TODO still need to replace source url to destination url in df cols and create custom dict {, }
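# --- For reference: a minimal sketch of what the explode()-based expansion
# above does, run on a toy frame. The ItemID values and URLs are made up;
# only the PictureURL column name comes from the diff.

import pandas as pd

df = pd.DataFrame({
    'ItemID': [1, 2],
    'PictureURL': [['https://a/1.jpg', 'https://a/2.jpg'], ['https://b/1.jpg']],
})

# explode() repeats each row once per element of its PictureURL list, so a
# listing with N pictures becomes N rows; reset_index(drop=True) renumbers
# the resulting rows 0..N-1.
expanded = df.explode('PictureURL').reset_index(drop=True)
print(expanded)
#    ItemID       PictureURL
# 0       1  https://a/1.jpg
# 1       1  https://a/2.jpg
# 2       2  https://b/1.jpg
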
diff --git a/ebay_api.py b/ebay_api.py
index 54b007d..9b6fac5 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -5,7 +5,8 @@ import json
 import requests
 import pandas as pd
 import config as cfg
-import wget
+import wget # NOTE may not need this if requests handles the downloads
+import shutil

 class FindingApi:
     '''Methods for accessing eBay's FindingApi services'''
@@ -256,13 +257,31 @@ class CurateData:
         instances. Modifies both class training and dropd dfs. Appends custom
         image url dict {'source':'destination'}.
         '''
+        #interm_s = class_training.PictureURL.apply(lambda x: len(x))
+        #expanded_class_training = class_training.loc[np.repeat(class_training.index.values, interm_s)].reset_index(drop=True)
+        expanded_class_training = class_training.explode('PictureURL').reset_index(drop=True)
+        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
+        #expanded_dropd = dropd.loc[np.repeat(dropd.index.values, interm_s)].reset_index(drop=True) # TODO CHANGE this to use explode(). picture list needs preservation
+        # prior to creating dropd and extracted. maybe run extracted_df after dropd or after running nvl_training
-        pass
+        #interm_s = interm_s.astype(str).applymap(lambda x: x.split(',')*4)
+        return expanded_class_training, expanded_dropd

-    def dl_pictures(self, expand=1):
+    def dl_pictures(self, dict_pic, expand=1):
         '''
         Downloads pictures from api to local storage using custom master dict
         '''
+
+        with open('dict_pic.txt') as jf: # TODO requires cleaning up; NOTE 'w+' would truncate the file before json.load could read it
+            dict_pics = json.load(jf)
+
+        r = requests.get('', stream=True)
+        r.raw.decode_content = True
+        filename = ''
+        with open(filename, 'wb') as f:
+            shutil.copyfileobj(r.raw, f)
+        # PictureURL values in the PictureURL list can't be downloaded directly... have to use the indirect address in the form https://i.ebayimg.com/images/g//s-l.jpg
+        # in place of https://i.ebayimg.com/00/s/ODQwWDE2MDA=/z//$_1.JPG, or use requests methods instead of wget with the original PictureURL? yes, use requests

 # TODO pipeline gameplan: 5 files: master img download dict, raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
 # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
 # if not exists and append to master img download dict
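
# --- For reference: a minimal standalone sketch of the requests + shutil
# streaming download that dl_pictures is working toward, assuming the master
# dict maps source URL to local destination path per the docstring
# ({'source':'destination'}). The dict_pic.txt filename comes from the diff;
# the example entry in the comment below is made up.

import json
import shutil
import requests

with open('dict_pic.txt') as jf:
    dict_pics = json.load(jf)  # e.g. {'https://i.ebayimg.com/...': 'images/0001.jpg'}

for source, destination in dict_pics.items():
    # stream=True keeps the whole image out of memory; decode_content=True
    # makes r.raw transparently undo any gzip/deflate transfer-encoding.
    r = requests.get(source, stream=True)
    r.raw.decode_content = True
    with open(destination, 'wb') as f:
        shutil.copyfileobj(r.raw, f)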