From 19723c0ea59b1a86f0a358793f21d2bea4c8a44f Mon Sep 17 00:00:00 2001 From: scott Date: Wed, 12 Jan 2022 22:25:40 -0700 Subject: [PATCH] fixed oauth renewal errors. added functionality for retrieving list of mens and womens cats separately --- :w | 595 +++++++++++++++++++++++++++++++++ Shoe Classifier_Xception.ipynb | 182 +++++----- ebay_api.py | 81 +++-- image_faults.py | 2 +- shopping.py | 7 - update_dataset.py | 13 + 6 files changed, 759 insertions(+), 121 deletions(-) create mode 100644 :w delete mode 100644 shopping.py create mode 100644 update_dataset.py diff --git a/:w b/:w new file mode 100644 index 0000000..efc95f6 --- /dev/null +++ b/:w @@ -0,0 +1,595 @@ +import os +from time import sleep +from random import randint +import scrape_ids +from datetime import datetime, timedelta +import dateutil +from dateutil import parser +import pytz +import pdb +from io import StringIO +import numpy as np +import concurrent.futures +import json +import requests +import pandas as pd +import config as cfg +import shutil +import re +import urllib, base64 + +from ebaysdk.exception import ConnectionError +from ebaysdk.trading import Connection as Trading +from ebaysdk.finding import Connection as Finding +from ebaysdk.shopping import Connection as Shopping + +# renew oauth token for shopping api +def getAuthToken(): + AppSettings = { + 'client_id': cfg.oauth["client_id"], + 'client_secret':cfg.oauth["client_secret"], + 'ruName':cfg.oauth["RuName"] + } + + authHeaderData = AppSettings['client_id'] + ':' + AppSettings['client_secret'] + encodedAuthHeader = base64.b64encode(str.encode(authHeaderData)) + encodedAuthHeader = str(encodedAuthHeader)[2:len(str(encodedAuthHeader))-1] + + headers = { + "Content-Type" : "application/x-www-form-urlencoded", # what is this? + "Authorization" : "Basic " + str(encodedAuthHeader) + } + + body= { + "grant_type" : "client_credentials", + "redirect_uri" : AppSettings['ruName'], + "scope" : "https://api.ebay.com/oauth/api_scope" + } + + data = urllib.parse.urlencode(body) + + tokenURL = "https://api.ebay.com/identity/v1/oauth2/token" + + response = requests.post(tokenURL, headers=headers, data=data).json() +# error = response['error_description'] #if errors + access_token = response['access_token'] + + with open('temp_oauth_token.txt', 'w') as f: + json.dump(access_token, f) + + return access_token + +class FindingApi: + ''' + Methods for accessing eBay's FindingApi services + ''' + + def __init__(self, service): + self.service = [ + 'findItemsAdvanced', 'findCompletedItems', + 'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory', + 'findItemsByProduct' + ][service] # Currently using only index 4, i.e., service = 4 + + def get_data(self, category_id): + + ''' + Gets raw JSON data fom FindingApi service call. 
Currently being used to + get itemIDs from categories; + ''' +# startTime = dateutil.parser.isoparse( startTime ) +# now = datetime.datetime.now(tz=pytz.UTC) +# days_on_site = (now - startTime).days # as int + + ids = [] + params = { + "OPERATION-NAME":self.service, + "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'], + "SERVICE-VERSION":"1.13.0", + "RESPONSE-DATA-FORMAT":"JSON", + "categoryId":category_id, + "paginationInput.entriesPerPage":"100", + "paginationInput.PageNumber":"1", + "itemFilter(0).name":"Condition", + "itemFilter(0).value":"Used", + "itemFilter.name":"HideDuplicateItems", + "itemFilter.value":"true", + "sortOrder":"StartTimeNewest", + } +# "itemFilter(1).name":"TopRatedSellerOnly", # TODO fix here +# "itemFilter(1).value":"true" + + try: + response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1", + params=params, timeout=24) + response.raise_for_status() + + except requests.exceptions.RequestException: # appears this works need to be able to continue where you left off or use better timeout? + print('connection error') + return ids + try: + data = response.json() + for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']: + ids.append(item['itemId'][0]) + + ids = list(set(ids)) + + except (AttributeError, KeyError): + print('AttributeError or KeyError. Exiting') + print(response.json()) + return ids + + return ids + +# TODO add some other options to finding call api such as for possibly filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe +# try and come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say for example, if you took a +# random set of 3 of 8 pictures total from each listing you might have a better chance of getting 3 good pictures in addition to increasing your training set. Or maybe you would have better luck with limiting +# it to the first 5 pictures instead of random. + +# You may even have more consistency with used shoes since they are "one-off" items without confusing multiple variations and colors. What else you can do is run small training sets on both new and used +# to see which one is more accurate or if a combo of both is more accurate. 
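
# Editor's sketch (not part of the patch): the note above weighs sampling a
# random 3 of a listing's ~8 pictures against simply keeping the first 5.
# A minimal, hedged illustration of both options, assuming each listing's
# PictureURL value is still a plain list of URL strings (as it is before
# expand_nvlclass() explodes it). The helper name cap_picture_urls is
# hypothetical and exists only for this example.

import random

def cap_picture_urls(urls, strategy='first', k=5):
    '''Return at most k picture URLs from a single listing.'''
    if not isinstance(urls, list):
        return urls
    if strategy == 'random' and len(urls) > k:
        return random.sample(urls, k)
    return urls[:k]

# e.g., applied to the training dataframe before expanding image lists:
# training['PictureURL'] = training['PictureURL'].apply(
#     lambda x: cap_picture_urls(x, strategy='random', k=3))
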
+ + def get_ids_from_cats(self): + ''' + Creates a 20-itemId list to use for the ShoppingApi + call + ''' + + ids = [] + + # load category id list + with open('cat_list.txt') as jf: + cat_list = json.load(jf) + + # load list of master ids + with open('master_ids.txt') as f: + master_ids = json.load(f) + + # fetch ids with calls to Finding Api given cats as param + with concurrent.futures.ThreadPoolExecutor() as executor: + for future in executor.map(self.get_data, cat_list): + ids.extend(future) + + # append master ids list with temporary ids from single function call and save + master_ids.extend(ids) + master_ids = list(set(master_ids)) + with open('master_ids.txt', 'w') as f: + json.dump(master_ids, f) + + # 20-ItemID list created to maximize dataset/decrease calls provided call constraints + twenty_id_list = [','.join(ids[n:n+20]) for n in list(range(0, + len(ids), 20))] + + return twenty_id_list, ids + +class ShoppingApi: + ''' + Creates objects from ShoppingApi service calls that can interact with + pandas dataframes + ''' + +# def __init__(self): +# +# # renew oauth token +# oauth_response = getAuthToken() +# access_token = oauth_response[0] +# +# self.access_token = access_token + + def update_cats(self): + ''' + Updates cat_list.txt + ''' + + parent_cats = ['3034', '93427'] # Women's and Men's shoe departments + cat_list = [] + # TODO make sep lists for women's and men's shoe cats. Needed to train + # mens and women's cats separately. This might improve val. acc. during training + + with open('temp_oauth_token.txt') as f: + access_token = json.load(f) + for department in parent_cats: + + headers = { + "X-EBAY-API-IAF-TOKEN":access_token, + "version":"671", + } + + url = "https://open.api.ebay.com/shopping?&callname=GetCategoryInfo&responseencoding=JSON&IncludeSelector=ChildCategories&CategoryID="+department + + try: + response = requests.get(url, headers=headers, timeout=4) + response.raise_for_status() + + except requests.exceptions.RequestException: + print('connection error') + + response = response.json() + parent_cat = response['CategoryArray']['Category'][0] + response = response['CategoryArray']['Category'][1:] # excludes index 0 as this is parent node, i.e., women's or men's dept. + + temp_cat_list = [cat['CategoryID'] for cat in response] + if parent_cat == '3034': + women_cats = temp_cat_list + elif parent_cat = '93427': + men_cats = temp_cat_list + cat_list.extend(temp_cat_list) + + with open('cat_list.txt', 'w') as f: + json.dump(cat_list, f) + with open('women_cat_list.txt', 'w') as f: + json.dump(women_cats, f) + with open('men_cat_list.txt', 'w') as f: + json.dump(men_cats, f) + + def get_item_from_findItemsByCategory(self, twenty_id): + + ''' + Gets raw JSON data from multiple live listings given multiple itemIds + ''' + with open('temp_oauth_token.txt') as f: + access_token = json.load(f) + + headers = { + "X-EBAY-API-IAF-TOKEN":access_token, + "version":"671", + } + + url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID="+twenty_id + + try: + + response = requests.get(url, headers=headers,timeout=24) + response.raise_for_status() + response = response.json() + item = response['Item'] + + + except (requests.exceptions.RequestException, KeyError): + print('connection error. IP limit possibly exceeded') + print(response) + return # returns NoneType. 
Handled at conky() + + return item + + def conky(self, twenty_ids_list): + ''' + Runs get_item_from_findItemsByCategory in multiple threads to get relevant + data for creating training sets + ''' + try: + with open('raw_data.txt') as f: + data = json.load(f) + except (FileNotFoundError, ValueError): + data = [] + + with concurrent.futures.ThreadPoolExecutor() as executor: + for future in executor.map(self.get_item_from_findItemsByCategory, twenty_ids_list): + if future is not None: + for item in future: + data.append(item) # The end result should be a list of dicts where each dict in the list is a listing + else: + print('response is None') + break + with open('raw_data.txt', 'w') as f: + json.dump(data, f) + return data + +# NOTE: + +# Limited to 5000 calls to shopping api per day, and getMultpileitems service maxes out at 20 items +# per call leaving you 100,000 items per day for you pandas dataframe initially. So you'll have +# to divide these up into the categories. This will leave you with about 6.25K results per cat. +# More than enough data for your dataset. + + +class CurateData: + ''' + Contains methods for curating data for machine learning training sets; + Takes item in data from ShoppingApi request as argument and extracts/ creates key + value pairs that gets updated to custom dataframe used in Ml training sets. + ''' + + def import_raw(self): + ''' + imports raw response json from local file. This is data from + GetMultipleItems call in ShoppingApi + ''' + with open('raw_data.txt') as f: + raw_data = json.load(f) + return raw_data + + def raw_df(self, raw_data): # TODO not dropping dupes, and is appending raw_data for some reason + ''' + creates pandas df from raw json and saves master raw csv file as raw_df.csv. + Indended to be used inline with direct + data stream from ebay's APIs + ''' + to_json = json.dumps(raw_data) + raw_df = pd.read_json(StringIO(to_json)) + raw_df.to_csv('raw_df.csv') # NOTE not append mode because raw_df is made from the master raw_data.txt file + #raw_df = pd.read_csv('raw_df.csv', index_col=0) + #raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # may not need this + #raw_df.to_csv('raw_df.csv') + + # TODO still saving "Unnamed:0" column + + return raw_df + + def to_training(self, raw_data): + ''' + creates first pass of potential labels for training set. This is the base + df used to produce other training sets to use. + ''' + raw_df = self.raw_df(raw_data) + interm_df1 = raw_df.loc[:,['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']] + interm_df1[['ItemID', 'PrimaryCAegoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str) + training = interm_df1.dropna(subset=['ItemSpecifics']) + return training # TODO RENAME THIS FUNC AND its RETURN VALUE + + def class_training(self, training): + '''Training set for multiclass portion of training set. 
Used to train + seprately from multilabel portion + ''' + class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']] + return class_training + + def nvl_training(self, training): + ''' + Training set for multilabel portion + ''' + interm_df1 = pd.Series(training.ItemSpecifics) + interm_df1 = interm_df1.apply(lambda x: x['NameValueList']) + + # Necessary for json_normalize(): + + nvl_dict = interm_df1.apply(lambda x: {k:v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])}) + nvl_df = pd.json_normalize(nvl_dict) + nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1) + + return nvl_training + + def extract_df(self, df): + ''' + converts single-value lists of strings of any df to string if not null + ''' + extracted_df = df.applymap(lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isnull(x) else x) + + return extracted_df + + def drop_nvl_cols(self, nvl_training): + + with open('cat_spacs.txt') as f: + cat_spacs = json.load(f) + + drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line', + 'Personalize', 'Fabric Type', 'Customized','Release Year', + 'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade', + 'Signed', 'Silhouette', 'Insole Material', 'Lining Material', + 'California Prop 65 Warning', 'Character Family', 'Character', + 'Cushioning Level', 'Personalization Instructions', 'Pronation', + ] + drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features', + 'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size', + 'Country/Region of Manufacture', 'Brand', 'Model'] + for cat in drop : + if cat in cat_spacs: + cat_spacs.remove(cat) + for cat in drop_2: + if cat in cat_spacs: + cat_spacs.remove(cat) + + user_input = input('drop cols? (y,n; default=y): ') + + if 'n' in user_input: + dropd = nvl_training#.drop(col_drop, errors='ignore', axis=1) # errors='ignore' for non existent labels + else: + cols = [] + for col in cat_spacs: + if col in list(nvl_training.columns): + cols.append(col) + cols.insert(0, 'PictureURL') # list of other cols that aren't needed for training + dropd = nvl_training[cols] + + return dropd + +# for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which +# can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1. + +# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type) + + def expand_nvlclass(self, class_training, dropd): + ''' + takes image url list from each cell and expands them into separate/duplicate + instances. Modifies both class training and dropd dfs. Appends custom + image url dict {'source':'target'}. + * consider applying this function to other cells that have multiple values in their lists + ''' + expand = input("expand image list or use primary listing image? 
(y or n): ") + if ('y' or 'Y') in expand: + expanded_class = class_training.explode('PictureURL').reset_index(drop=True) + expanded_class = expanded_class.dropna(subset=['PictureURL']) + expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True) + + expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) + expanded_dropd = expanded_dropd.dropna(subset=['PictureURL']) + expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True) + + expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values + + temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) + + else: + class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan) + expanded_class = class_training.dropna() + dropd = dropd.dropna(subset=['PictureURL']) + dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan) + dropd = dropd.dropna(subset=['PictureURL']) + expanded_dropd = dropd + + expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values + + # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt + temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) + + try: + with open('temp_pics_source_list.txt') as f: + tpsl = json.load(f) + tpsl.extend(temp_pics_source_list) + + # ensures no duplicate source URLs exist + temp_pics_source_list = list(set(tpsl)) + with open('temp_pics_source_list.txt', 'w') as f: + json.dump(temp_pics_source_list, f) + + # creates file if script is ran for 1st time and file not present + except (ValueError, FileNotFoundError): + with open('temp_pics_source_list.txt', 'w') as f: + json.dump(temp_pics_source_list, f) + + # Append to master training dataframes, drop potential dupes and save + expanded_class.to_csv('expanded_class.csv') + expanded_dropd.to_csv('expanded_dropd.csv') + + return expanded_class, expanded_dropd + + def dl_pic(self,dict_pics, pic): + + try: + + # check if image exists in current working directory. avoids dupes + if os.path.exists(dict_pics[pic]): + pass + + else: + + try: + + r = requests.get(pic, stream=True) + r.raw.decode_content = True + with open(dict_pics[pic], 'wb') as f: + shutil.copyfileobj(r.raw, f) + + except ConnectionError: + return + + except KeyError: + pass + + def dict_pics(self): + + try: + with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments + target_dir = json.load(f) + + except (ValueError, FileNotFoundError): + target_dir = input('No target dirctory found. Create One? [y] or [n]:') + if target_dir == ('y' or 'Y'): + target_dir = input('Please provide full URL to destination folder:') # TODO need to catch human syntax errors here + with open('target_dirs.txt','w') as f: + json.dump(target_dir, f) + + else: + os.mkdir(os.getcwd()+os.sep+'training_images') + target_dir = os.getcwd()+os.sep+'training_images' + with open('target_dirs.txt','w') as f: + json.dump(target_dir, f) + print('Creating default folder in current directory @ ' + target_dir) + + # open url list in working directory + with open('temp_pics_source_list.txt') as f: + + try: + temp_pics_source_list = json.load(f) + + except (ValueError, FileNotFoundError): + print('url list not found. 
aborting') + return + + dict_pics = {} + + # make custom dict, {source:target}, and name images from unique URL patt + for k in temp_pics_source_list: + patt_1 = re.search(r'[^/]+(?=/\$_|.(\.jpg|\.jpeg|\.png))', k, re.IGNORECASE) + patt_2 = re.search(r'(\.jpg|\.jpeg|\.png)', k, re.IGNORECASE) + if patt_1 and patt_2 is not None: + tag = patt_1.group() + patt_2.group().lower() + file_name = target_dir + os.sep + tag + dict_pics.update({k:file_name}) + + with open('dict_pics.txt', 'w') as f: + json.dump(dict_pics, f) + + return dict_pics # TODO still need to find sol to outliers (aka, naming scheme for unusual source URLs) + + def dl_pictures(self, *dict_pics): + ''' + Downloads pictures from api to local storage using temp_pics_source_list + and dict_pics + ''' + + if not dict_pics: + dict_pics = self.dict_pics() + + with open('temp_pics_source_list.txt') as f: + try: + temp_pics_source_list = json.load(f) + except (ValueError, FileNotFoundError): + print('url list not found. download aborted') + return + + bargs = [(dict_pics, pic) for pic in temp_pics_source_list] + with concurrent.futures.ThreadPoolExecutor() as executor: + for future in executor.map(lambda p: self.dl_pic(*p), bargs): + if future is not None: + future + else: + print('connection error') + +class PreProcessing: + ''' + Includes methods for pre-processing training set input and labels in the + training set created from CurateData class. Whereas CurateData training + sets provided trimmed down data from the raw json response from the + ShoppingApi call and provided a bare minimum format for the dataframe to be + used in training, PreProcessing optimizes that dataframe for training and + includes methods for image manipulation, creating test/train/validation + splits, etc. + ''' + + def dict_pics(self): + ''' + Source to target training. Replaces source image URL with target URL + determined by values in dict_pics variable. + ''' + + target_dir = os.getcwd() + with open('temp_pics_source_list.txt') as f: + temp_pics_source_list = json.load(f) + dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list} + print("{source:target} dictionary created @ " + os.getcwd() + os.sep + 'training_images') + return dict_pics + + # TODO pipeline gameplan: 5 files: dict_pics.txt,raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv + # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures + # if not exists and append to master img download dict + # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted + +def main(): + ''' + Main program creates/updates a csv file to use for ML training from live + ebay listings + ''' + pass +# main goes here: + +if __name__ == "__main__": + main() + +''' +Based on your sample set of 10 images, if you have an average of 5 images per +listing and you download a hundred listings, you will have about 102 Gb of +image data. That's just for one day. If you have more than a million listings +you're looking at a little over 1Tb of image data. You don't even know if this +is good data yet. 
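
A hedged back-of-the-envelope check (assuming roughly 1 MB per listing photo,
a figure not given above): 100 listings x 5 images is about 500 images, or
roughly 0.5 GB per day under that assumption; reaching the ~1 TB scale would
take on the order of a million images at that size.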
+''' diff --git a/Shoe Classifier_Xception.ipynb b/Shoe Classifier_Xception.ipynb index ae4eb7f..d9d36a5 100644 --- a/Shoe Classifier_Xception.ipynb +++ b/Shoe Classifier_Xception.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "id": "572dc7fb", "metadata": {}, "outputs": [], @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "id": "8d94196d", "metadata": {}, "outputs": [], @@ -68,70 +68,106 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "id": "a5c72863", "metadata": {}, "outputs": [], "source": [ - "# image_faults.faulty_images() # removes faulty images\n", + "image_faults.faulty_images() # removes faulty images\n", "df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def dict_pics_jup():\n", + " '''\n", + " {source:target} dict used to replace source urls with image location as input\n", + " '''\n", + " target_dir = os.getcwd() + os.sep + \"training_images\"\n", + " with open('temp_pics_source_list.txt') as f:\n", + " temp_pics_source_list = json.load(f)\n", + " \n", + " dict_pics = {}\n", + " for k in temp_pics_source_list:\n", + " patt_1 = re.search(r'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))', k, re.IGNORECASE)\n", + " patt_2 = re.search(r'(\\.jpg|\\.jpeg|\\.png)', k, re.IGNORECASE)\n", + " if patt_1 and patt_2 is not None:\n", + " tag = patt_1.group() + patt_2.group().lower()\n", + " file_name = target_dir + os.sep + tag\n", + " dict_pics.update({k:file_name})\n", + " print(\"{source:target} dictionary created @ \" + target_dir)\n", + " return dict_pics\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "1057a442", "metadata": { "scrolled": true }, "outputs": [ { - "ename": "AttributeError", - "evalue": "'NoneType' object has no attribute 'group'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_pics_jup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mblah\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPictureURL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PictureURL'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mdict_pics_jup\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mwith\u001b[0m 
\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m 
\u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'group'" + "name": "stdout", + "output_type": "stream", + "text": [ + "{source:target} dictionary created @ /tf/training_images\n" ] } ], "source": [ - "def dict_pics_jup(): # \n", - " target_dir = os.getcwd() + os.sep + \"training_images\"\n", - " with open('temp_pics_source_list.txt') as f:\n", - " temp_pics_source_list = json.load(f)\n", - " dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}\n", - " print(\"{source:target} dictionary created @ \" + target_dir)\n", - " return dict_pics\n", - "\n", "dict_pics = dict_pics_jup()\n", + "with open('temp_pics_source_list.txt') as f:\n", + " tempics = json.load(f)\n", + "# list of image urls that did not get named properly which will be removed from the dataframe\n", + "drop_row_vals = []\n", + "for pic in tempics:\n", + " try:\n", + " dict_pics[pic]\n", + " except KeyError:\n", + " drop_row_vals.append(pic)\n", + " \n", + "df = df[df.PictureURL.isin(drop_row_vals)==False]\n", + "# TODO drop men's or women's categories here\n", "blah = pd.Series(df.PictureURL)\n", "df = df.drop(labels=['PictureURL'], axis=1)\n", + "\n", "blah = blah.apply(lambda x: dict_pics[x])\n", "df = pd.concat([blah, df],axis=1)\n", - "df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers\n", - "# removes non-existent image paths" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7a6146e6", - "metadata": {}, - "outputs": [], - "source": [ - "df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints\n", - "\n", - "df=df.sample(frac=1)" + "df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers" ] }, { "cell_type": "code", "execution_count": 6, + "id": "7a6146e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "17" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints\n", + "\n", + "df=df.sample(frac=1)\n", + "len(drop_row_vals)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "114cc3c0", "metadata": {}, "outputs": [], @@ -143,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "506aa5cf", "metadata": {}, "outputs": [], @@ -155,18 +191,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "4d72eb90", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 5110 validated image filenames belonging to 13 classes.\n", - "Found 1277 validated image filenames belonging to 13 classes.\n" - ] - }, { "name": "stderr", "output_type": "stream", @@ -174,6 +202,14 @@ "/usr/local/lib/python3.8/dist-packages/keras_preprocessing/image/dataframe_iterator.py:279: UserWarning: Found 1 invalid image filename(s) in x_col=\"PictureURL\". 
These filename(s) will be ignored.\n", " warnings.warn(\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 53005 validated image filenames belonging to 13 classes.\n", + "Found 13251 validated image filenames belonging to 13 classes.\n" + ] } ], "source": [ @@ -212,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "7b70f37f", "metadata": {}, "outputs": [], @@ -222,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "1ed54bf5", "metadata": {}, "outputs": [], @@ -239,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "85934565", "metadata": {}, "outputs": [], @@ -250,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "6322bcad", "metadata": {}, "outputs": [ @@ -270,7 +306,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "07fd25c6", "metadata": {}, "outputs": [], @@ -283,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "b31af79e", "metadata": {}, "outputs": [], @@ -294,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "fe06f2bf", "metadata": {}, "outputs": [ @@ -728,7 +764,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "ea620129", "metadata": {}, "outputs": [], @@ -741,7 +777,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "fd5d1246", "metadata": {}, "outputs": [], @@ -753,7 +789,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "9cd2ba27", "metadata": { "scrolled": false @@ -764,35 +800,13 @@ "output_type": "stream", "text": [ "Epoch 1/30\n", - "80/80 [==============================] - 78s 913ms/step - loss: 1.8419 - accuracy: 0.4153 - val_loss: 2.6144 - val_accuracy: 0.0720\n", + "829/829 [==============================] - 786s 942ms/step - loss: 1.5037 - accuracy: 0.4896 - val_loss: 1.2946 - val_accuracy: 0.5520\n", "Epoch 2/30\n", - "80/80 [==============================] - 69s 862ms/step - loss: 0.9255 - accuracy: 0.7121 - val_loss: 2.5804 - val_accuracy: 0.1026\n", + "829/829 [==============================] - 726s 875ms/step - loss: 0.8550 - accuracy: 0.7117 - val_loss: 1.3593 - val_accuracy: 0.5593\n", "Epoch 3/30\n", - "80/80 [==============================] - 72s 900ms/step - loss: 0.4003 - accuracy: 0.9092 - val_loss: 2.4443 - val_accuracy: 0.2310\n", + "829/829 [==============================] - 750s 905ms/step - loss: 0.3322 - accuracy: 0.8993 - val_loss: 1.5304 - val_accuracy: 0.5542\n", "Epoch 4/30\n", - "80/80 [==============================] - 71s 888ms/step - loss: 0.1224 - accuracy: 0.9847 - val_loss: 2.1299 - val_accuracy: 0.3782\n", - "Epoch 5/30\n", - "80/80 [==============================] - 75s 935ms/step - loss: 0.0371 - accuracy: 0.9975 - val_loss: 1.7368 - val_accuracy: 0.4973\n", - "Epoch 6/30\n", - "80/80 [==============================] - 73s 900ms/step - loss: 0.0167 - accuracy: 0.9992 - val_loss: 1.6747 - val_accuracy: 0.5341\n", - "Epoch 7/30\n", - "80/80 [==============================] - 72s 897ms/step - loss: 0.0097 - accuracy: 0.9998 - val_loss: 1.6494 - val_accuracy: 0.5442\n", - "Epoch 8/30\n", - "80/80 [==============================] - 74s 915ms/step - loss: 0.0062 - accuracy: 0.9998 - val_loss: 1.6659 - val_accuracy: 0.5568\n", - "Epoch 9/30\n", - "80/80 [==============================] - 74s 917ms/step - loss: 
0.0044 - accuracy: 1.0000 - val_loss: 1.7088 - val_accuracy: 0.5615\n", - "Epoch 10/30\n", - "80/80 [==============================] - 70s 868ms/step - loss: 0.0035 - accuracy: 1.0000 - val_loss: 1.7540 - val_accuracy: 0.5583\n", - "Epoch 11/30\n", - "80/80 [==============================] - 70s 864ms/step - loss: 0.0027 - accuracy: 0.9998 - val_loss: 1.7894 - val_accuracy: 0.5552\n", - "Epoch 12/30\n", - "80/80 [==============================] - 69s 858ms/step - loss: 0.0020 - accuracy: 1.0000 - val_loss: 1.8126 - val_accuracy: 0.5536\n", - "Epoch 13/30\n", - "80/80 [==============================] - 69s 857ms/step - loss: 0.0019 - accuracy: 1.0000 - val_loss: 1.8496 - val_accuracy: 0.5544\n", - "Epoch 14/30\n", - "80/80 [==============================] - 69s 859ms/step - loss: 0.0015 - accuracy: 1.0000 - val_loss: 1.8646 - val_accuracy: 0.5544\n", - "Epoch 15/30\n", - "30/80 [==========>...................] - ETA: 36s - loss: 0.0011 - accuracy: 1.0000" + "172/829 [=====>........................] - ETA: 7:57 - loss: 0.1030 - accuracy: 0.9787" ] }, { @@ -802,7 +816,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m model.fit(x=train_generator,\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidation_generator\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mvalidation_steps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalidation_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m model.fit(x=train_generator,\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidation_generator\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mvalidation_steps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalidation_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py\u001b[0m in \u001b[0;36merror_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0mfiltered_tb\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 64\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 65\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint: disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0mfiltered_tb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_process_traceback_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.8/dist-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[1;32m 1219\u001b[0m \u001b[0mlogs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtmp_logs\u001b[0m \u001b[0;31m# No error, now safe to assign to logs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1220\u001b[0m \u001b[0mend_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdata_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep_increment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1221\u001b[0;31m \u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_train_batch_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mend_step\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1222\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstop_training\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1223\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.8/dist-packages/keras/callbacks.py\u001b[0m in \u001b[0;36mon_train_batch_end\u001b[0;34m(self, batch, logs)\u001b[0m\n\u001b[1;32m 434\u001b[0m \"\"\"\n\u001b[1;32m 435\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_should_call_train_batch_hooks\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 436\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_batch_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mModeKeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTRAIN\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'end'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 437\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0mon_test_batch_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", diff --git a/ebay_api.py b/ebay_api.py index 47fc166..9a77e43 100644 --- a/ebay_api.py +++ b/ebay_api.py @@ -26,9 +26,9 @@ from ebaysdk.shopping import Connection as Shopping # renew oauth token for shopping api def getAuthToken(): AppSettings = { - 'client_id': cfg.oauth.client_id, - 'client_secret':cfg.oauth.client_secret, - 'ruName':cfg.oauth.RuName + 'client_id': cfg.oauth["client_id"], + 'client_secret':cfg.oauth["client_secret"], + 'ruName':cfg.oauth["RuName"] } authHeaderData = AppSettings['client_id'] + ':' + AppSettings['client_secret'] @@ -50,14 +50,14 @@ def getAuthToken(): tokenURL = "https://api.ebay.com/identity/v1/oauth2/token" - response = requests.post(tokenURL, headers=headers, data=data) - error = response['error_description'] #if errors - access_token = response.json()['access_token'] + response = requests.post(tokenURL, headers=headers, data=data).json() +# error = response['error_description'] #if errors + access_token = response['access_token'] - with open('temp_oath_token.txt', 'w') as f: + with open('temp_oauth_token.txt', 'w') as f: json.dump(access_token, f) - return access_token, error + return access_token class FindingApi: ''' @@ -135,27 +135,32 @@ class FindingApi: call ''' - itemid_results_list = [] + ids = [] + # load category id list with open('cat_list.txt') as jf: cat_list = json.load(jf) + # load list of master ids + with open('master_ids.txt') as f: + master_ids = json.load(f) + + # fetch ids with calls to Finding Api given cats as param with concurrent.futures.ThreadPoolExecutor() as executor: for future in executor.map(self.get_data, cat_list): - itemid_results_list.extend(future) + ids.extend(future) - print(len(itemid_results_list)) - a = list(set(itemid_results_list)) - print(len(a)) + # append master ids list with temporary ids from single function call and save + master_ids.extend(ids) + master_ids = list(set(master_ids)) + with open('master_ids.txt', 'w') as f: + json.dump(master_ids, f) - with open('raw_ids.txt', 'w') as f: - json.dump(itemid_results_list, f) + # 20-ItemID list created to maximize dataset/decrease calls provided call constraints + twenty_id_list = [','.join(ids[n:n+20]) for n in list(range(0, + len(ids), 20))] - # 20-ItemID list created to maximize dataset/decrease calls given call constraints - item_id_results = [','.join(itemid_results_list[n:n+20]) for n in list(range(0, - len(itemid_results_list), 20))] - - return item_id_results, itemid_results_list + return twenty_id_list, ids class ShoppingApi: ''' @@ -163,11 +168,13 @@ class ShoppingApi: pandas dataframes ''' - def __init__(self): - - # renew oauth token - access_token = getAuthToken()[0] - self.access_token = access_token +# def __init__(self): +# +# # renew oauth token +# oauth_response = getAuthToken() +# access_token = oauth_response[0] +# +# self.access_token = access_token def update_cats(self): ''' @@ -176,11 +183,15 @@ class ShoppingApi: parent_cats = ['3034', '93427'] # Women's and Men's shoe departments cat_list = [] + # TODO make sep lists for women's and men's shoe cats. Needed to train + # mens and women's cats separately. This might improve val. acc. 
during training + with open('temp_oauth_token.txt') as f: + access_token = json.load(f) for department in parent_cats: headers = { - "X-EBAY-API-IAF-TOKEN":self.access_token, + "X-EBAY-API-IAF-TOKEN":access_token, "version":"671", } @@ -197,19 +208,31 @@ class ShoppingApi: response = response['CategoryArray']['Category'][1:] # excludes index 0 as this is parent node, i.e., women's or men's dept. temp_cat_list = [cat['CategoryID'] for cat in response] + + if department == '3034': + women_cats = temp_cat_list + elif department == '93427': + men_cats = temp_cat_list + cat_list.extend(temp_cat_list) - with open('cat_list.txt', 'w') as f: - json.dump(cat_list, f) + with open('cat_list.txt', 'w') as f: + json.dump(cat_list, f) + with open('women_cat_list.txt', 'w') as f: + json.dump(women_cats, f) + with open('men_cat_list.txt', 'w') as f: + json.dump(men_cats, f) def get_item_from_findItemsByCategory(self, twenty_id): ''' Gets raw JSON data from multiple live listings given multiple itemIds ''' + with open('temp_oauth_token.txt') as f: + access_token = json.load(f) headers = { - "X-EBAY-API-IAF-TOKEN":self.access_token, + "X-EBAY-API-IAF-TOKEN":access_token, "version":"671", } diff --git a/image_faults.py b/image_faults.py index e90294c..8aa6382 100644 --- a/image_faults.py +++ b/image_faults.py @@ -17,7 +17,7 @@ def faulty_images(): img = PIL.Image.open(img_p) except PIL.UnidentifiedImageError: os.remove(img_p) - print(img_p + "Removed") +# print(img_p + "Removed") # remove from folder, dataset(is constructed from the csv files # ), dict_pics, temp_pics_source_list, # expanded_dropd, expanded_class. But, remember that if you run curate.py diff --git a/shopping.py b/shopping.py deleted file mode 100644 index 4008658..0000000 --- a/shopping.py +++ /dev/null @@ -1,7 +0,0 @@ -''' -Initial download and write of raw data from ebay -''' -import ebay_api - -shopping = ebay_api.ShoppingApi() -data = shopping.conky() diff --git a/update_dataset.py b/update_dataset.py new file mode 100644 index 0000000..efcc0a8 --- /dev/null +++ b/update_dataset.py @@ -0,0 +1,13 @@ +''' +Update dataset; instantiates FindingApi and makes call to eBay's Finding Api +using the findItemsByCategory service. Updates the master_ids list and raw_data. +''' +import ebay_api + +# Make call to ebay Finding service and return list of twenty_id strings +finding = ebay_api.FindingApi(4) # 4 is URL paramter for used items +twenty_id_list = finding.get_ids_from_cats()[0] + +# renew oauth token and make call to shopping service to get item data and write to local file +shopping = ebay_api.ShoppingApi() +data = shopping.conky(twenty_id_list)
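
Editor's note: the patch now writes women_cat_list.txt and men_cat_list.txt in
ShoppingApi.update_cats() but nothing consumes them yet (the notebook still
carries "# TODO drop men's or women's categories here"). Below is a minimal,
hedged sketch of how those files might be used to train on one department at a
time. It assumes expanded_class.csv and the two category-list files exist in
the working directory and that PrimaryCategoryID compares as a string, as in
the notebook; the helper name filter_department is hypothetical.

import json
import pandas as pd

def filter_department(df, cat_list_file='women_cat_list.txt'):
    '''Keep only rows whose PrimaryCategoryID falls in the chosen department.'''
    with open(cat_list_file) as f:
        dept_cats = set(str(c) for c in json.load(f))
    return df[df['PrimaryCategoryID'].astype(str).isin(dept_cats)]

# usage sketch:
# df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)
# women_df = filter_department(df, 'women_cat_list.txt')
# men_df = filter_department(df, 'men_cat_list.txt')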