diff --git a/curate.py b/curate.py
new file mode 100644
index 0000000..5cff803
--- /dev/null
+++ b/curate.py
@@ -0,0 +1,29 @@
+import ebay_api
+import numpy as np
+
+'''
+File used to compile methods from ebay_api.py for curating training data.
+'''
+
+curate = ebay_api.CurateData()
+raw_data = curate.import_raw()
+training = curate.to_training(raw_data) # NOTE have to reference the PictureURL list here if you want to expand; the column is a string in subsequent dfs
+# or use dropd.PictureURL.str.split(' ')
+class_training = curate.class_training(training)
+nvl_training = curate.nvl_training(training)
+extracted_df = curate.extract_contents(nvl_training)
+dropd = curate.drop_nvl_cols(extracted_df)
+
+def expand_nvlclass(class_training, dropd):
+    '''
+    Takes the image url list from each cell and expands it into separate/duplicate
+    instances. Modifies both the class_training and dropd dfs and appends a custom
+    image url dict {'source':'destination'}.
+    '''
+    interm_s = class_training.PictureURL.apply(len) # list lengths; still needed for expanded_dropd below
+    #expanded_class_training = class_training.loc[np.repeat(class_training.index.values, interm_s)].reset_index(drop=True)
+    expanded_class_training = class_training.explode('PictureURL').reset_index(drop=True)
+    expanded_dropd = dropd.loc[np.repeat(dropd.index.values, interm_s)].reset_index(drop=True) # TODO CHANGE this to use explode(). picture list needs preservation
+    # prior to creating dropd and extracted. maybe run extract_contents after dropd or after running nvl_training
+    #interm_s = interm_s.astype(str).applymap(lambda x: x.split(',')*4)
+    return expanded_class_training, expanded_dropd
diff --git a/ebay_api.py b/ebay_api.py
index f915b0d..54b007d 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -3,9 +3,9 @@ import numpy as np
 import concurrent.futures
 import json
 import requests
-from requests.exceptions import Timeout, ConnectionError
 import pandas as pd
 import config as cfg
+import wget

 class FindingApi:
     '''Methods for accessing eBays FindingApi services'''
@@ -44,10 +44,11 @@ class FindingApi:
         # TODO add try excepts here
         try:
             response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
-                    params=params, timeout=1)
+                    params=params, timeout=3)
+            response.raise_for_status()

-        except HTTPError:
-            print('connection error')
+        except requests.exceptions.RequestException:
+            print('connection error') # TODO DECIDE HOW TO HANDLE EXCEPTION
         data = response.json()
         return data

@@ -121,8 +122,9 @@ class ShoppingApi:
         # TODO Add try excepts here
         try:
             response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
-        except HTTPError:
-            print('connection error') #TODO figure out how to pick up where left off or loop until it works
+            response.raise_for_status()
+        except requests.exceptions.RequestException:
+            print('connection error')
         response = response.json()
         response = response['Item']
         return response
@@ -141,9 +143,12 @@ class ShoppingApi:
                 for item in future:
                     data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
                     # data.update(future)
+        # TODO save data here. You'll use this with your curate data class. Save this as a text file.
         return data # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list.
+        # data dictionary can't update from list of dicts unless iterated over. Might need a different way to update.
 # TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.
+
 class CurateData:
     '''
     Contains functions for curating data for machine learning training sets;
@@ -152,35 +157,56 @@ class CurateData:
     '''

     def import_raw(self):
+        '''
+        Imports the raw response json from a local file.
+        '''
         with open('raw_data.txt') as f:
             raw_data = json.load(f)
             return raw_data

-    def data_frame(self, data):
-        to_json = json.dumps(data)
+    def raw_df(self, raw_data):
+        '''
+        Creates a pandas df from the raw json. Intended to be used inline with a
+        direct data stream from eBay's APIs.
+        '''
+        to_json = json.dumps(raw_data)
         raw_df = pd.read_json(to_json)
         return raw_df

-    def to_training(self, data):
-        raw_df = self.data_frame(data)
+    def to_training(self, raw_data): # NOTE need to create copies not views
+        '''
+        Creates the first pass of potential labels for the training set. This is
+        the base df used to produce the other training sets.
+        '''
+        raw_df = self.raw_df(raw_data)
         interm_df1 = raw_df.loc[:,['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
         interm_df1[['ItemID', 'PrimaryCAegoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
         training = interm_df1
-        return training
+        return training # TODO RENAME THIS FUNC AND RETURN VALUE

     def class_training(self, training):
+        '''Training set for the multiclass portion. Used to train separately
+        from the multilabel portion.
+        '''
         class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
         return class_training

     def nvl_training(self, training):
+        '''
+        Training set for the multilabel portion.
+        '''
         interm_df1 = pd.Series(training.ItemSpecifics)
         interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
         nvl_dict = interm_df1.apply(lambda x: {k:v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])})
         nvl_df = pd.json_normalize(nvl_dict)
         nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)
+        # TODO MAY HAVE TO RUN drop_nvl_cols and extract_contents in here
         return nvl_training

     def extract_contents(self, df):
+        '''
+        Converts single-value lists of strings in any df cell to a plain string if not null.
+        '''
         extracted_df = df.applymap(lambda x: ' '.join(x) if np.any(pd.notnull(x)) else np.nan)
         return extracted_df

@@ -197,7 +223,7 @@ class CurateData:
             'Calf Circumference', 'Handmade', 'Safety Standards',
             'Customised', 'Cleat Type', 'Cushioning Level', 'AU Shoe Size',
             'Country/Region of Manufacture', 'Type of Sport', 'Main Colour',
-            'Look', 'Sole Type', 'Sole Manufacturer Colour', 'Sole Material',
+            'Look', 'Sole Type', 'Manufacturer Colour', 'Sole Material',
             'Toe Material', 'Feature', 'Length', 'Width', 'Size Chart',
             'Boot Height', 'Water Resistance Level', 'Material Composition',
             'Calf Width', 'Insole Material', 'UPC', 'Size Type'
@@ -215,27 +241,38 @@ class CurateData:
         user_input = input('drop or keep cols?:')
         if 'keep' in user_input:
-            dropd_nvl_training = nvl_training.loc[:,col_keep]
+            dropd = nvl_training.loc[:,col_keep]
         else:
-            dropd_nvl_training = nvl_training.drop(col_drop, axis=1)
-        return dropd_nvl_training
+            dropd = nvl_training.drop(col_drop, axis=1)
+        return dropd

-    def combine_nvlclass(self, class_training, dropd_nvl_training):
-        final_training = pd.concat([class_training, dropd_nvl_training], axis=1)
-        return final_training
+    def combine_nvlclass(self, class_training, dropd):
+        final_training = pd.concat([class_training, dropd], axis=1)
+        return final_training # TODO might not need this
+
+    def expand_nvlclass(self, class_training, dropd):
+        '''
+        Takes the image url list from each cell and expands it into separate/duplicate
+        instances. Modifies both the class_training and dropd dfs and appends a custom
+        image url dict {'source':'destination'}.
+        '''
+
+        pass

     def dl_pictures(self, expand=1):
         '''
-        Downloads pictures from api to local storage and expands url list
-        to user specified number
+        Downloads pictures from the api to local storage using the custom master dict.
         '''
-        pass
+        # TODO pipeline gameplan: 5 files: master img download dict, raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
+        # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
+        # if not exists and append to master img download dict
+        # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted

-# TODO Still need to to extract strings from list of strings and then drop which ones you don't want or vice versa. You may have to avoid using cells with lists of strings longer than one (e.g., 'Features')
-# TODO Also need to expand photo list from PictureURL. Decide how many or which photos to use. You may even want to use a pretrained model to decide whether or not the photos are of shoes or not to filter#
-# it might be that only the first picture is reliable enough to use in the dataset.
-# TODO also need to decide which features are going to be relevant. For example, is color really necessary for finding features? is it necessary to train your model on this or can you find color an easier way?
-    def update_df(self, data):
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for result in executor.map(download_function, master_url_dict): # NOTE download_function and master_url_dict are placeholders; neither is defined yet
+                pass
+
+    def update_df(self, data): # TODO save raw df as csv file
         '''
         Creates training instances for dataset. picture_url_list expanded to
         max available pictures with each picture url corresponding to features
         '''
         pass

-        # Ultimately you need each record to be one picture url as input and relevant columns determined from custom nvl_dicts. You can figure out how you need to address the multiple values in the lists when you make the df just before the final df (this one may include the multiple pictures from each list in the original records. This should be your next step).
-
-        # Considering the above, you need to figure out how to expand the url list while keeping the nvl_df intact
-        # So, before you can do the above two comments you should first figure out what kind of format you will need your df to be in for training. You require multilabel/multiclass(?)...consult that one article on identifying rainforests, and also hands on machine learning with blah blah. Also consult HPP for combining dfs efficiently.
-
-        # USE combination of apply() and dict comprehension to extract your custom nvl_dict from nvl in each cell
-        # USE training.apply(func, axis= something) to create your custom nvl_dict for each cell
-        # USE raw_df.loc[:, ['col1', col2', 'col3', 'etc']] for creating new df. There may be another way though.
-
-        # USE pd.merge() at some point...possibly after expanding lists and nvl. consult HPP book for a more efficient way to combine dfs.
-        # USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can transform each new raw_df first before combining it with the previous transformed
-        # df. then you can take the raw_df and combine it with the old raw_df for backup.
-# TODO You will have to mess around more with pandas df to find a better solution to creating your csv file: i.e., create dataframe from from instances, run through process to customize your df
 # for final training set for your ml model training. Contemplate on the future... you want ability to update main csv AND training csv; one for updating raw data instances from search queries, and
 # the other for updating your training set.
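
Note on the open TODOs: the sketch below is not part of the patch above; it is one possible shape for the PictureURL expansion and image-download steps that the comments in expand_nvlclass and dl_pictures describe. It assumes the expansion runs while PictureURL still holds a list (i.e., before extract_contents/drop_nvl_cols), and the names expand_nvlclass_sketch, build_url_dict, download_image, dl_pictures_sketch and the 'images' destination folder are illustrative placeholders, not the project's actual API. requests is used here instead of the wget import added in the diff only to keep the sketch self-contained.

# Sketch only -- assumed helper names, not applied by the diff above.
import concurrent.futures
import os

import requests


def expand_nvlclass_sketch(class_training, dropd):
    '''
    Explode the PictureURL lists so each image url becomes its own row,
    duplicating the remaining columns. Assumes both dfs still hold
    list-valued PictureURL cells (run before extract_contents/drop_nvl_cols).
    '''
    expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
    expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
    return expanded_class, expanded_dropd


def build_url_dict(expanded_class, dest_dir='images'):
    '''
    Build the {'source': 'destination'} master download dict from the
    expanded class df. dest_dir is an assumed local folder.
    '''
    os.makedirs(dest_dir, exist_ok=True)
    return {url: os.path.join(dest_dir, os.path.basename(url))
            for url in expanded_class['PictureURL'].dropna().unique()}


def download_image(pair):
    '''Download one (source_url, destination_path) pair; skip if already saved.'''
    url, dest = pair
    if os.path.exists(dest):
        return dest
    resp = requests.get(url, timeout=3)
    resp.raise_for_status()
    with open(dest, 'wb') as f:
        f.write(resp.content)
    return dest


def dl_pictures_sketch(url_dict):
    '''Download all images concurrently, mirroring the ThreadPoolExecutor pattern already used in ebay_api.py.'''
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return list(executor.map(download_image, url_dict.items()))

Rough usage under those assumptions: expanded_class, expanded_dropd = expand_nvlclass_sketch(class_training, dropd); dl_pictures_sketch(build_url_dict(expanded_class)). The existing-file check in download_image is one way to satisfy the "if not exists and append to master img download dict" note in dl_pictures.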