fixed oauth renewal errors. added functionality for retrieving lists of men's and women's cats separately
This commit is contained in:
@@ -0,0 +1,595 @@
import os
from time import sleep
from random import randint
import scrape_ids
from datetime import datetime, timedelta
import dateutil
from dateutil import parser
import pytz
import pdb
from io import StringIO
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
import config as cfg
import shutil
import re
import urllib.parse, base64

from ebaysdk.exception import ConnectionError
from ebaysdk.trading import Connection as Trading
from ebaysdk.finding import Connection as Finding
from ebaysdk.shopping import Connection as Shopping

# renew oauth token for shopping api
def getAuthToken():
     AppSettings = {
          'client_id': cfg.oauth["client_id"],
          'client_secret':cfg.oauth["client_secret"],
          'ruName':cfg.oauth["RuName"]
          }

     authHeaderData = AppSettings['client_id'] + ':' + AppSettings['client_secret']
     encodedAuthHeader = base64.b64encode(str.encode(authHeaderData))
     encodedAuthHeader = str(encodedAuthHeader)[2:len(str(encodedAuthHeader))-1]

     headers = {
          "Content-Type" : "application/x-www-form-urlencoded", # what is this?
          "Authorization" : "Basic " + str(encodedAuthHeader)
          }

     body = {
          "grant_type" : "client_credentials",
          "redirect_uri" : AppSettings['ruName'],
          "scope" : "https://api.ebay.com/oauth/api_scope"
          }

     data = urllib.parse.urlencode(body)

     tokenURL = "https://api.ebay.com/identity/v1/oauth2/token"

     response = requests.post(tokenURL, headers=headers, data=data).json()
#     error = response['error_description'] # if errors
     access_token = response['access_token']

     with open('temp_oauth_token.txt', 'w') as f:
         json.dump(access_token, f)

     return access_token

class FindingApi:
    '''
    Methods for accessing eBay's FindingApi services
    '''

    def __init__(self, service):
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
            ][service] # Currently using only index 4, i.e., service = 4

    def get_data(self, category_id):
        '''
        Gets raw JSON data from FindingApi service call. Currently being used to
        get itemIDs from categories.
        '''
#        startTime = dateutil.parser.isoparse( startTime )
#        now = datetime.datetime.now(tz=pytz.UTC)
#        days_on_site = (now - startTime).days # as int

        ids = []
        params = {
            "OPERATION-NAME":self.service,
            "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
            "SERVICE-VERSION":"1.13.0",
            "RESPONSE-DATA-FORMAT":"JSON",
            "categoryId":category_id,
            "paginationInput.entriesPerPage":"100",
            "paginationInput.PageNumber":"1",
            "itemFilter(0).name":"Condition",
            "itemFilter(0).value":"Used",
            "itemFilter.name":"HideDuplicateItems",
            "itemFilter.value":"true",
            "sortOrder":"StartTimeNewest",
            }
#            "itemFilter(1).name":"TopRatedSellerOnly", # TODO fix here
#            "itemFilter(1).value":"true"

        try:
            response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                params=params, timeout=24)
            response.raise_for_status()

        except requests.exceptions.RequestException: # appears this works; need to be able to continue where you left off or use a better timeout?
            print('connection error')
            return ids

        try:
            data = response.json()
            for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                ids.append(item['itemId'][0])

            ids = list(set(ids))

        except (AttributeError, KeyError):
            print('AttributeError or KeyError. Exiting')
            print(response.json())
            return ids

        return ids

# TODO add some other options to the Finding API call, such as possibly filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
# try and come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say, for example, if you took a
# random set of 3 of the 8 pictures total from each listing, you might have a better chance of getting 3 good pictures in addition to increasing your training set. Or maybe you would have better luck with limiting
# it to the first 5 pictures instead of a random sample (see the sketch below).

# You may even have more consistency with used shoes since they are "one-off" items without confusing multiple variations and colors. What else you can do is run small training sets on both new and used
# to see which one is more accurate, or if a combo of both is more accurate.

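# A minimal sketch of the sampling idea above (hypothetical helper, not called
# anywhere yet; k=3 and a first-5 cut are only the example numbers from the comment):
    @staticmethod
    def sample_picture_urls(picture_urls, k=3, first_n=None):
        '''
        Returns a subset of a listing's picture URLs: the first `first_n` images
        if first_n is given, otherwise a random sample of up to `k` of them.
        '''
        from random import sample

        if first_n is not None:
            return picture_urls[:first_n]
        if len(picture_urls) <= k:
            return list(picture_urls)
        return sample(picture_urls, k)
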
    def get_ids_from_cats(self):
        '''
        Creates a 20-itemId list to use for the ShoppingApi
        call
        '''
        ids = []

        # load category id list
        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        # load list of master ids
        with open('master_ids.txt') as f:
            master_ids = json.load(f)

        # fetch ids with calls to the Finding API given cats as param
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_data, cat_list):
                ids.extend(future)

        # append master ids list with temporary ids from single function call and save
        master_ids.extend(ids)
        master_ids = list(set(master_ids))
        with open('master_ids.txt', 'w') as f:
            json.dump(master_ids, f)

        # 20-ItemID list created to maximize dataset/decrease calls given call constraints
        twenty_id_list = [','.join(ids[n:n+20]) for n in range(0, len(ids), 20)]

        return twenty_id_list, ids

class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes
    '''

#    def __init__(self):
#
#        # renew oauth token
#        oauth_response = getAuthToken()
#        access_token = oauth_response[0]
#
#        self.access_token = access_token

    def update_cats(self):
        '''
        Updates cat_list.txt
        '''
        parent_cats = ['3034', '93427'] # Women's and Men's shoe departments
        cat_list = []
        # TODO make separate lists for women's and men's shoe cats. Needed to train
        # men's and women's cats separately. This might improve val. acc. during training

        with open('temp_oauth_token.txt') as f:
            access_token = json.load(f)
        for department in parent_cats:

            headers = {
                "X-EBAY-API-IAF-TOKEN":access_token,
                "version":"671",
                }

            url = "https://open.api.ebay.com/shopping?&callname=GetCategoryInfo&responseencoding=JSON&IncludeSelector=ChildCategories&CategoryID="+department

            try:
                response = requests.get(url, headers=headers, timeout=4)
                response.raise_for_status()

            except requests.exceptions.RequestException:
                print('connection error')

            response = response.json()
            parent_cat = response['CategoryArray']['Category'][0]['CategoryID']
            response = response['CategoryArray']['Category'][1:] # excludes index 0 as this is the parent node, i.e., women's or men's dept.

            temp_cat_list = [cat['CategoryID'] for cat in response]
            if parent_cat == '3034':
                women_cats = temp_cat_list
            elif parent_cat == '93427':
                men_cats = temp_cat_list
            cat_list.extend(temp_cat_list)

        with open('cat_list.txt', 'w') as f:
            json.dump(cat_list, f)
        with open('women_cat_list.txt', 'w') as f:
            json.dump(women_cats, f)
        with open('men_cat_list.txt', 'w') as f:
            json.dump(men_cats, f)

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds
        '''
        with open('temp_oauth_token.txt') as f:
            access_token = json.load(f)

        headers = {
            "X-EBAY-API-IAF-TOKEN":access_token,
            "version":"671",
            }

        url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID="+twenty_id

        try:
            response = requests.get(url, headers=headers, timeout=24)
            response.raise_for_status()
            response = response.json()
            item = response['Item']

        except (requests.exceptions.RequestException, KeyError):
            print('connection error. IP limit possibly exceeded')
            print(response)
            return # returns NoneType. Handled at conky()

        return item

    def conky(self, twenty_ids_list):
        '''
        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
        data for creating training sets
        '''
        try:
            with open('raw_data.txt') as f:
                data = json.load(f)
        except (FileNotFoundError, ValueError):
            data = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_item_from_findItemsByCategory, twenty_ids_list):
                if future is not None:
                    for item in future:
                        data.append(item) # the end result should be a list of dicts where each dict in the list is a listing
                else:
                    print('response is None')
                    break
        with open('raw_data.txt', 'w') as f:
            json.dump(data, f)
        return data

# NOTE:

# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
# per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset. A quick sanity check on that arithmetic follows below.


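# Back-of-the-envelope check of the note above (a sketch; the 5,000-call limit and
# 20-item cap come from the note, while the category count of 16 is only inferred
# from the "about 6.25K per cat" figure):
def estimate_daily_budget(calls_per_day=5000, items_per_call=20, num_categories=16):
    '''
    Returns (items_per_day, items_per_category) under the Shopping API limits above.
    '''
    items_per_day = calls_per_day * items_per_call          # 100,000
    items_per_category = items_per_day / num_categories     # ~6,250
    return items_per_day, items_per_category

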
class CurateData:
    '''
    Contains methods for curating data for machine learning training sets.
    Takes item data from the ShoppingApi request as argument and extracts/creates key
    value pairs that get updated to a custom dataframe used in ML training sets.
    '''

    def import_raw(self):
        '''
        imports raw response json from local file. This is data from the
        GetMultipleItems call in ShoppingApi
        '''
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
            return raw_data

    def raw_df(self, raw_data): # TODO not dropping dupes, and is appending raw_data for some reason
        '''
        creates pandas df from raw json and saves the master raw csv file as raw_df.csv.
        Intended to be used inline with a direct
        data stream from ebay's APIs
        '''
        to_json = json.dumps(raw_data)
        raw_df = pd.read_json(StringIO(to_json))
        raw_df.to_csv('raw_df.csv') # NOTE not append mode because raw_df is made from the master raw_data.txt file
        #raw_df = pd.read_csv('raw_df.csv', index_col=0)
        #raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # may not need this
        #raw_df.to_csv('raw_df.csv')

        # TODO still saving "Unnamed: 0" column

        return raw_df

    def to_training(self, raw_data):
        '''
        creates first pass of potential labels for the training set. This is the base
        df used to produce the other training sets.
        '''
        raw_df = self.raw_df(raw_data)
        interm_df1 = raw_df.loc[:,['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
        training = interm_df1.dropna(subset=['ItemSpecifics'])
        return training # TODO RENAME THIS FUNC AND its RETURN VALUE

    def class_training(self, training):
        '''Training set for the multiclass portion of the training set. Used to train
        separately from the multilabel portion
        '''
        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
        return class_training

    def nvl_training(self, training):
        '''
        Training set for the multilabel portion
        '''
        interm_df1 = pd.Series(training.ItemSpecifics)
        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])

        # Necessary for json_normalize():
        nvl_dict = interm_df1.apply(lambda x: {k:v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])})
        nvl_df = pd.json_normalize(nvl_dict)
        nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)

        return nvl_training

    def extract_df(self, df):
        '''
        converts single-value lists of strings in any df to strings if not null
        '''
        extracted_df = df.applymap(lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isnull(x) else x)

        return extracted_df

    def drop_nvl_cols(self, nvl_training):

        with open('cat_spacs.txt') as f:
            cat_spacs = json.load(f)

        drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line',
                'Personalize', 'Fabric Type', 'Customized', 'Release Year',
                'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade',
                'Signed', 'Silhouette', 'Insole Material', 'Lining Material',
                'California Prop 65 Warning', 'Character Family', 'Character',
                'Cushioning Level', 'Personalization Instructions', 'Pronation',
                ]
        drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features',
                'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size',
                'Country/Region of Manufacture', 'Brand', 'Model']
        for cat in drop:
            if cat in cat_spacs:
                cat_spacs.remove(cat)
        for cat in drop_2:
            if cat in cat_spacs:
                cat_spacs.remove(cat)

        user_input = input('drop cols? (y,n; default=y): ')

        if 'n' in user_input:
            dropd = nvl_training#.drop(col_drop, errors='ignore', axis=1) # errors='ignore' for non existent labels
        else:
            cols = []
            for col in cat_spacs:
                if col in list(nvl_training.columns):
                    cols.append(col)
            cols.insert(0, 'PictureURL') # list of other cols that aren't needed for training
            dropd = nvl_training[cols]

        return dropd

# For future reference: to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique, which
# can be determined by applying a function to find the frequency of k/v pairs --> list of unique k/v pairs --> frequency of each unique k/v pair --> drop those with a count of 1.
# A sketch of this idea follows below.

# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)

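    # Minimal sketch of the frequency-based cleanup described above (hypothetical
    # helper, not wired into the pipeline yet; the min_count threshold is an assumption):
    def drop_rare_nvl_values(self, nvl_training, min_count=2):
        '''
        Replaces name/value entries that appear fewer than min_count times across
        the training set with NaN, since one-off custom seller values mostly add
        noise to the multilabel targets.
        '''
        cleaned = nvl_training.copy()
        for col in cleaned.columns:
            if col == 'PictureURL':
                continue
            counts = cleaned[col].value_counts()
            rare = counts[counts < min_count].index
            cleaned[col] = cleaned[col].where(~cleaned[col].isin(rare))
        return cleaned
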
    def expand_nvlclass(self, class_training, dropd):
        '''
        takes the image url list from each cell and expands it into separate/duplicate
        instances. Modifies both the class_training and dropd dfs. Appends to the custom
        image url dict {'source':'target'}.
        * consider applying this function to other cells that have multiple values in their lists
        '''
        expand = input("expand image list or use primary listing image? (y or n): ")
        if 'y' in expand.lower():
            expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
            expanded_class = expanded_class.dropna(subset=['PictureURL'])
            expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

            expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
            expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
            expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

            expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

        else:
            class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan)
            expanded_class = class_training.dropna()
            dropd = dropd.dropna(subset=['PictureURL'])
            dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan)
            dropd = dropd.dropna(subset=['PictureURL'])
            expanded_dropd = dropd

            expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

            # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt
            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

        try:
            with open('temp_pics_source_list.txt') as f:
                tpsl = json.load(f)
                tpsl.extend(temp_pics_source_list)

                # ensures no duplicate source URLs exist
                temp_pics_source_list = list(set(tpsl))
                with open('temp_pics_source_list.txt', 'w') as f:
                    json.dump(temp_pics_source_list, f)

        # creates the file if the script is run for the 1st time and the file is not present
        except (ValueError, FileNotFoundError):
            with open('temp_pics_source_list.txt', 'w') as f:
                json.dump(temp_pics_source_list, f)

        # Append to master training dataframes, drop potential dupes and save
        expanded_class.to_csv('expanded_class.csv')
        expanded_dropd.to_csv('expanded_dropd.csv')

        return expanded_class, expanded_dropd

    def dl_pic(self, dict_pics, pic):

        try:
            # check if the image already exists in the target directory. avoids dupes
            if os.path.exists(dict_pics[pic]):
                pass

            else:
                try:
                    r = requests.get(pic, stream=True)
                    r.raw.decode_content = True
                    with open(dict_pics[pic], 'wb') as f:
                        shutil.copyfileobj(r.raw, f)

                except ConnectionError:
                    return

        except KeyError:
            pass

    def dict_pics(self):

        try:
            with open('target_dirs.txt', 'r+') as f: # TODO you can add an option to change the directory here, too. Look up how to have optional arguments
                target_dir = json.load(f)

        except (ValueError, FileNotFoundError):
            target_dir = input('No target directory found. Create one? [y] or [n]: ')
            if target_dir.lower() == 'y':
                target_dir = input('Please provide the full path to the destination folder: ') # TODO need to catch human syntax errors here
                with open('target_dirs.txt', 'w') as f:
                    json.dump(target_dir, f)

            else:
                os.mkdir(os.getcwd()+os.sep+'training_images')
                target_dir = os.getcwd()+os.sep+'training_images'
                with open('target_dirs.txt', 'w') as f:
                    json.dump(target_dir, f)
                    print('Creating default folder in current directory @ ' + target_dir)

        # open url list in working directory
        with open('temp_pics_source_list.txt') as f:
            try:
                temp_pics_source_list = json.load(f)

            except (ValueError, FileNotFoundError):
                print('url list not found. aborting')
                return

        dict_pics = {}

        # make custom dict, {source:target}, and name images from the unique URL patterns
        for k in temp_pics_source_list:
            patt_1 = re.search(r'[^/]+(?=/\$_|.(\.jpg|\.jpeg|\.png))', k, re.IGNORECASE)
            patt_2 = re.search(r'(\.jpg|\.jpeg|\.png)', k, re.IGNORECASE)
            if patt_1 is not None and patt_2 is not None:
                tag = patt_1.group() + patt_2.group().lower()
                file_name = target_dir + os.sep + tag
                dict_pics.update({k:file_name})

        with open('dict_pics.txt', 'w') as f:
            json.dump(dict_pics, f)

        return dict_pics # TODO still need to find a solution for outliers (aka a naming scheme for unusual source URLs)

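    # Worked example of the naming scheme above on a made-up eBay image URL
    # (the URL is hypothetical, for illustration only):
    #   k      = 'https://i.ebayimg.com/images/g/AbCdEfGh123/$_57.JPG'
    #   patt_1 -> 'AbCdEfGh123'   (the path segment just before '/$_')
    #   patt_2 -> '.JPG', lowercased to '.jpg'
    #   tag    = 'AbCdEfGh123.jpg'
    #   file_name = target_dir + os.sep + 'AbCdEfGh123.jpg'
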
    def dl_pictures(self, dict_pics=None):
        '''
        Downloads pictures from the api to local storage using temp_pics_source_list
        and dict_pics
        '''
        if dict_pics is None:
            dict_pics = self.dict_pics()

        with open('temp_pics_source_list.txt') as f:
            try:
                temp_pics_source_list = json.load(f)
            except (ValueError, FileNotFoundError):
                print('url list not found. download aborted')
                return

        bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(lambda p: self.dl_pic(*p), bargs):
                if future is not None:
                    future
                else:
                    print('connection error')

class PreProcessing:
    '''
    Includes methods for pre-processing training set inputs and labels in the
    training set created from the CurateData class. Whereas the CurateData training
    sets provide trimmed-down data from the raw json response from the
    ShoppingApi call and a bare-minimum format for the dataframe to be
    used in training, PreProcessing optimizes that dataframe for training and
    includes methods for image manipulation, creating test/train/validation
    splits, etc.
    '''

    def dict_pics(self):
        '''
        Source to target training. Replaces the source image URL with the target URL
        determined by the values in the dict_pics variable.
        '''
        target_dir = os.getcwd()
        with open('temp_pics_source_list.txt') as f:
            temp_pics_source_list = json.load(f)
        dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
        print("{source:target} dictionary created @ " + os.getcwd() + os.sep + 'training_images')
        return dict_pics

        # TODO pipeline gameplan: 5 files: dict_pics.txt, raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv
        # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
        # if not exists and append to master img download dict
        # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted

def main():
    '''
    Main program creates/updates a csv file to use for ML training from live
    ebay listings
    '''
    pass
# main goes here:

if __name__ == "__main__":
    main()

'''
Based on your sample set of 10 images, if you have an average of 5 images per
listing and you download a hundred thousand listings, you will have about 102 Gb of
image data. That's just for one day. If you have more than a million listings
you're looking at a little over 1Tb of image data. You don't even know if this
is good data yet.
'''
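# Rough arithmetic behind the note above (a sketch; the ~0.2 MB average image size
# is only inferred from the 10-image sample and the 102 Gb figure):
#   100,000 listings/day * 5 images/listing = 500,000 images/day
#   500,000 images * ~0.2 MB/image ≈ 100 Gb/day
#   1,000,000+ listings * 5 images * ~0.2 MB ≈ 1 Tb+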
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "execution_count": 1,
   "id": "572dc7fb",
   "metadata": {},
   "outputs": [],
@@ -35,7 +35,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "execution_count": 2,
   "id": "8d94196d",
   "metadata": {},
   "outputs": [],
@@ -68,70 +68,106 @@
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "execution_count": 3,
   "id": "a5c72863",
   "metadata": {},
   "outputs": [],
   "source": [
    "# image_faults.faulty_images() # removes faulty images\n",
    "image_faults.faulty_images() # removes faulty images\n",
    "df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dict_pics_jup():\n",
    "    '''\n",
    "    {source:target} dict used to replace source urls with image location as input\n",
    "    '''\n",
    "    target_dir = os.getcwd() + os.sep + \"training_images\"\n",
    "    with open('temp_pics_source_list.txt') as f:\n",
    "        temp_pics_source_list = json.load(f)\n",
    "        \n",
    "    dict_pics = {}\n",
    "    for k in temp_pics_source_list:\n",
    "         patt_1 = re.search(r'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))', k, re.IGNORECASE)\n",
    "         patt_2 = re.search(r'(\\.jpg|\\.jpeg|\\.png)', k, re.IGNORECASE)\n",
    "         if patt_1 and patt_2 is not None:\n",
    "             tag = patt_1.group() + patt_2.group().lower()\n",
    "             file_name = target_dir + os.sep + tag\n",
    "             dict_pics.update({k:file_name})\n",
    "    print(\"{source:target} dictionary created @ \" + target_dir)\n",
    "    return dict_pics\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1057a442",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'NoneType' object has no attribute 'group'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_pics_jup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     10\u001b[0m \u001b[0mblah\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPictureURL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PictureURL'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36mdict_pics_jup\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m         \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m         \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'group'"
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{source:target} dictionary created @ /tf/training_images\n"
     ]
    }
   ],
   "source": [
    "def dict_pics_jup(): # \n",
    "    target_dir = os.getcwd() + os.sep + \"training_images\"\n",
    "    with open('temp_pics_source_list.txt') as f:\n",
    "        temp_pics_source_list = json.load(f)\n",
    "    dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}\n",
    "    print(\"{source:target} dictionary created @ \" + target_dir)\n",
    "    return dict_pics\n",
    "\n",
    "dict_pics = dict_pics_jup()\n",
    "with open('temp_pics_source_list.txt') as f:\n",
    "    tempics = json.load(f)\n",
    "# list of image urls that did not get named properly which will be removed from the dataframe\n",
    "drop_row_vals = []\n",
    "for pic in tempics:\n",
    "    try:\n",
    "        dict_pics[pic]\n",
    "    except KeyError:\n",
    "        drop_row_vals.append(pic)\n",
    "        \n",
    "df = df[df.PictureURL.isin(drop_row_vals)==False]\n",
    "# TODO drop men's or women's categories here\n",
    "blah = pd.Series(df.PictureURL)\n",
    "df = df.drop(labels=['PictureURL'], axis=1)\n",
    "\n",
    "blah = blah.apply(lambda x: dict_pics[x])\n",
    "df = pd.concat([blah, df],axis=1)\n",
    "df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers\n",
    "# removes non-existent image paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7a6146e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints\n",
    "\n",
    "df=df.sample(frac=1)"
    "df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7a6146e6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints\n",
    "\n",
    "df=df.sample(frac=1)\n",
    "len(drop_row_vals)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "114cc3c0",
   "metadata": {},
   "outputs": [],
@@ -143,7 +179,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "execution_count": 8,
   "id": "506aa5cf",
   "metadata": {},
   "outputs": [],
@@ -155,18 +191,10 @@
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "execution_count": 9,
   "id": "4d72eb90",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 5110 validated image filenames belonging to 13 classes.\n",
      "Found 1277 validated image filenames belonging to 13 classes.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
@@ -174,6 +202,14 @@
      "/usr/local/lib/python3.8/dist-packages/keras_preprocessing/image/dataframe_iterator.py:279: UserWarning: Found 1 invalid image filename(s) in x_col=\"PictureURL\". These filename(s) will be ignored.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 53005 validated image filenames belonging to 13 classes.\n",
      "Found 13251 validated image filenames belonging to 13 classes.\n"
     ]
    }
   ],
   "source": [
@@ -212,7 +248,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "execution_count": 10,
   "id": "7b70f37f",
   "metadata": {},
   "outputs": [],
@@ -222,7 +258,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "execution_count": 11,
   "id": "1ed54bf5",
   "metadata": {},
   "outputs": [],
@@ -239,7 +275,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "execution_count": 12,
   "id": "85934565",
   "metadata": {},
   "outputs": [],
@@ -250,7 +286,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "execution_count": 13,
   "id": "6322bcad",
   "metadata": {},
   "outputs": [
@@ -270,7 +306,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "execution_count": 14,
   "id": "07fd25c6",
   "metadata": {},
   "outputs": [],
@@ -283,7 +319,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "execution_count": 15,
   "id": "b31af79e",
   "metadata": {},
   "outputs": [],
@@ -294,7 +330,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "execution_count": 16,
   "id": "fe06f2bf",
   "metadata": {},
   "outputs": [
@@ -728,7 +764,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "execution_count": 17,
   "id": "ea620129",
   "metadata": {},
   "outputs": [],
@@ -741,7 +777,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "execution_count": 18,
   "id": "fd5d1246",
   "metadata": {},
   "outputs": [],
@@ -753,7 +789,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "execution_count": 19,
   "id": "9cd2ba27",
   "metadata": {
    "scrolled": false
@@ -764,35 +800,13 @@
     "output_type": "stream",
     "text": [
      "Epoch 1/30\n",
      "80/80 [==============================] - 78s 913ms/step - loss: 1.8419 - accuracy: 0.4153 - val_loss: 2.6144 - val_accuracy: 0.0720\n",
      "829/829 [==============================] - 786s 942ms/step - loss: 1.5037 - accuracy: 0.4896 - val_loss: 1.2946 - val_accuracy: 0.5520\n",
      "Epoch 2/30\n",
      "80/80 [==============================] - 69s 862ms/step - loss: 0.9255 - accuracy: 0.7121 - val_loss: 2.5804 - val_accuracy: 0.1026\n",
      "829/829 [==============================] - 726s 875ms/step - loss: 0.8550 - accuracy: 0.7117 - val_loss: 1.3593 - val_accuracy: 0.5593\n",
      "Epoch 3/30\n",
      "80/80 [==============================] - 72s 900ms/step - loss: 0.4003 - accuracy: 0.9092 - val_loss: 2.4443 - val_accuracy: 0.2310\n",
      "829/829 [==============================] - 750s 905ms/step - loss: 0.3322 - accuracy: 0.8993 - val_loss: 1.5304 - val_accuracy: 0.5542\n",
      "Epoch 4/30\n",
      "80/80 [==============================] - 71s 888ms/step - loss: 0.1224 - accuracy: 0.9847 - val_loss: 2.1299 - val_accuracy: 0.3782\n",
      "Epoch 5/30\n",
      "80/80 [==============================] - 75s 935ms/step - loss: 0.0371 - accuracy: 0.9975 - val_loss: 1.7368 - val_accuracy: 0.4973\n",
      "Epoch 6/30\n",
      "80/80 [==============================] - 73s 900ms/step - loss: 0.0167 - accuracy: 0.9992 - val_loss: 1.6747 - val_accuracy: 0.5341\n",
      "Epoch 7/30\n",
      "80/80 [==============================] - 72s 897ms/step - loss: 0.0097 - accuracy: 0.9998 - val_loss: 1.6494 - val_accuracy: 0.5442\n",
      "Epoch 8/30\n",
      "80/80 [==============================] - 74s 915ms/step - loss: 0.0062 - accuracy: 0.9998 - val_loss: 1.6659 - val_accuracy: 0.5568\n",
      "Epoch 9/30\n",
      "80/80 [==============================] - 74s 917ms/step - loss: 0.0044 - accuracy: 1.0000 - val_loss: 1.7088 - val_accuracy: 0.5615\n",
      "Epoch 10/30\n",
      "80/80 [==============================] - 70s 868ms/step - loss: 0.0035 - accuracy: 1.0000 - val_loss: 1.7540 - val_accuracy: 0.5583\n",
      "Epoch 11/30\n",
      "80/80 [==============================] - 70s 864ms/step - loss: 0.0027 - accuracy: 0.9998 - val_loss: 1.7894 - val_accuracy: 0.5552\n",
      "Epoch 12/30\n",
      "80/80 [==============================] - 69s 858ms/step - loss: 0.0020 - accuracy: 1.0000 - val_loss: 1.8126 - val_accuracy: 0.5536\n",
      "Epoch 13/30\n",
      "80/80 [==============================] - 69s 857ms/step - loss: 0.0019 - accuracy: 1.0000 - val_loss: 1.8496 - val_accuracy: 0.5544\n",
      "Epoch 14/30\n",
      "80/80 [==============================] - 69s 859ms/step - loss: 0.0015 - accuracy: 1.0000 - val_loss: 1.8646 - val_accuracy: 0.5544\n",
      "Epoch 15/30\n",
      "30/80 [==========>...................] - ETA: 36s - loss: 0.0011 - accuracy: 1.0000"
      "172/829 [=====>........................] - ETA: 7:57 - loss: 0.1030 - accuracy: 0.9787"
     ]
    },
    {
@@ -802,7 +816,7 @@
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-18-4cd4443bbf2a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m model.fit(x=train_generator,\n\u001b[0m\u001b[1;32m      2\u001b[0m           \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m           \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidation_generator\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m           \u001b[0mvalidation_steps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalidation_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m           \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-19-4cd4443bbf2a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m model.fit(x=train_generator,\n\u001b[0m\u001b[1;32m      2\u001b[0m           \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m           \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidation_generator\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m           \u001b[0mvalidation_steps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalidation_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m           \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py\u001b[0m in \u001b[0;36merror_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     62\u001b[0m     \u001b[0mfiltered_tb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     63\u001b[0m     \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 64\u001b[0;31m       \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     65\u001b[0m     \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m  \u001b[0;31m# pylint: disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     66\u001b[0m       \u001b[0mfiltered_tb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_process_traceback_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 | 
			
		||||
      "\u001b[0;32m/usr/local/lib/python3.8/dist-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[1;32m   1219\u001b[0m               \u001b[0mlogs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtmp_logs\u001b[0m  \u001b[0;31m# No error, now safe to assign to logs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1220\u001b[0m               \u001b[0mend_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdata_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep_increment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1221\u001b[0;31m               \u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_train_batch_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mend_step\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1222\u001b[0m               \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstop_training\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1223\u001b[0m                 \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 | 
			
		||||
      "\u001b[0;32m/usr/local/lib/python3.8/dist-packages/keras/callbacks.py\u001b[0m in \u001b[0;36mon_train_batch_end\u001b[0;34m(self, batch, logs)\u001b[0m\n\u001b[1;32m    434\u001b[0m     \"\"\"\n\u001b[1;32m    435\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_should_call_train_batch_hooks\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 436\u001b[0;31m       \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_batch_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mModeKeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTRAIN\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'end'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    437\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    438\u001b[0m   \u001b[0;32mdef\u001b[0m \u001b[0mon_test_batch_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
81  ebay_api.py
@@ -26,9 +26,9 @@ from ebaysdk.shopping import Connection as Shopping
# renew oauth token for shopping api
def getAuthToken():
     AppSettings = {
          'client_id': cfg.oauth.client_id,
          'client_secret':cfg.oauth.client_secret,
          'ruName':cfg.oauth.RuName
          'client_id': cfg.oauth["client_id"],
          'client_secret':cfg.oauth["client_secret"],
          'ruName':cfg.oauth["RuName"]
          }

     authHeaderData = AppSettings['client_id'] + ':' + AppSettings['client_secret']
@@ -50,14 +50,14 @@ def getAuthToken():

     tokenURL = "https://api.ebay.com/identity/v1/oauth2/token"

     response = requests.post(tokenURL, headers=headers, data=data)
     error = response['error_description'] #if errors
     access_token = response.json()['access_token']
     response = requests.post(tokenURL, headers=headers, data=data).json()
#     error = response['error_description'] #if errors
     access_token = response['access_token']

     with open('temp_oath_token.txt', 'w') as f:
     with open('temp_oauth_token.txt', 'w') as f:
         json.dump(access_token, f)

     return access_token, error
     return access_token

class FindingApi:
    '''
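For orientation, a minimal usage sketch of the fixed token flow above (illustrative only; it assumes valid credentials in config.py and network access, and the variable names are not from the diff). getAuthToken() now returns just the token string and caches it in temp_oauth_token.txt, so later calls can reload the cached file rather than reading it off self.

    import json
    from ebay_api import getAuthToken

    token = getAuthToken()             # fixed version returns only the token string

    with open('temp_oauth_token.txt') as f:
        cached = json.load(f)          # the copy getAuthToken() dumped to disk

    assert token == cached             # both are the same bare OAuth string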
@@ -135,27 +135,32 @@ class FindingApi:
        call
        '''

        itemid_results_list = []
        ids = []

        # load category id list
        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        # load list of master ids
        with open('master_ids.txt') as f:
            master_ids = json.load(f)

        # fetch ids with calls to Finding Api given cats as param
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_data, cat_list):
                itemid_results_list.extend(future)
                ids.extend(future)

            print(len(itemid_results_list))
            a = list(set(itemid_results_list))
            print(len(a))
        # append master ids list with temporary ids from single function call and save
        master_ids.extend(ids)
        master_ids = list(set(master_ids))
        with open('master_ids.txt', 'w') as f:
            json.dump(master_ids, f)

        with open('raw_ids.txt', 'w') as f:
            json.dump(itemid_results_list, f)
 # 20-ItemID list created to maximize dataset/decrease calls provided call constraints
        twenty_id_list = [','.join(ids[n:n+20]) for n in list(range(0,
            len(ids), 20))]

 # 20-ItemID list created to maximize dataset/decrease calls given call constraints
        item_id_results = [','.join(itemid_results_list[n:n+20]) for n in list(range(0,
            len(itemid_results_list), 20))]

        return item_id_results, itemid_results_list
        return twenty_id_list, ids

class ShoppingApi:
    '''
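A small worked example of the 20-ID chunking used above (the ID values here are made up): the comprehension slices the flat ID list into comma-joined strings of at most 20 IDs each, presumably the per-call constraint the comment refers to.

    ids = [str(n) for n in range(45)]   # 45 placeholder item IDs

    # same comprehension as get_ids_from_cats: batches of at most 20, comma-joined
    twenty_id_list = [','.join(ids[n:n+20]) for n in list(range(0, len(ids), 20))]

    print(len(twenty_id_list))   # 3  -> two full batches of 20 plus one of 5
    print(twenty_id_list[-1])    # '40,41,42,43,44'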
@@ -163,11 +168,13 @@ class ShoppingApi:
    pandas dataframes
    '''

    def __init__(self):

        # renew oauth token
        access_token = getAuthToken()[0]
        self.access_token = access_token
#    def __init__(self):
#
#        # renew oauth token
#        oauth_response = getAuthToken()
#        access_token = oauth_response[0]
#
#        self.access_token = access_token

    def update_cats(self):
        '''
@@ -176,11 +183,15 @@ class ShoppingApi:

        parent_cats = ['3034', '93427'] # Women's and Men's shoe departments
        cat_list = []
        # TODO make sep lists for women's and men's shoe cats. Needed to train
        # mens and women's cats separately. This might improve val. acc. during training

        with open('temp_oauth_token.txt') as f:
            access_token = json.load(f)
        for department in parent_cats:

            headers = {
                "X-EBAY-API-IAF-TOKEN":self.access_token,
                "X-EBAY-API-IAF-TOKEN":access_token,
                "version":"671",
                }

@@ -197,19 +208,31 @@ class ShoppingApi:
            response = response['CategoryArray']['Category'][1:] # excludes index 0 as this is parent node, i.e., women's or men's dept.

            temp_cat_list = [cat['CategoryID'] for cat in response]

            if department == '3034':
                women_cats = temp_cat_list
            elif department == '93427':
                men_cats = temp_cat_list

            cat_list.extend(temp_cat_list)

            with open('cat_list.txt', 'w') as f:
                json.dump(cat_list, f)
        with open('cat_list.txt', 'w') as f:
            json.dump(cat_list, f)
        with open('women_cat_list.txt', 'w') as f:
            json.dump(women_cats, f)
        with open('men_cat_list.txt', 'w') as f:
            json.dump(men_cats, f)

    def get_item_from_findItemsByCategory(self, twenty_id):

        '''
       Gets raw JSON data from multiple live listings given multiple itemIds
        '''
        with open('temp_oauth_token.txt') as f:
            access_token = json.load(f)

        headers = {
            "X-EBAY-API-IAF-TOKEN":self.access_token,
            "X-EBAY-API-IAF-TOKEN":access_token,
            "version":"671",
            }

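The two department-specific files written by update_cats above could then be consumed like this (a sketch; only the file names and JSON format come from the diff, the rest is illustrative):

    import json

    with open('women_cat_list.txt') as f:
        women_cats = json.load(f)    # category IDs under parent 3034
    with open('men_cat_list.txt') as f:
        men_cats = json.load(f)      # category IDs under parent 93427

    # e.g., pull IDs for one department at a time, per the TODO about training
    # men's and women's categories separately
    print(len(women_cats), len(men_cats))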
@@ -17,7 +17,7 @@ def faulty_images():
            img = PIL.Image.open(img_p)
        except PIL.UnidentifiedImageError:
                os.remove(img_p)
                print(img_p + "Removed")
#                print(img_p + "Removed")
#             remove from folder, dataset(is constructed from the csv files
#             ), dict_pics, temp_pics_source_list,
#             expanded_dropd, expanded_class. But, remember that if you run curate.py

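For context, the except branch in the hunk above fits a cleanup loop of roughly this shape (a reconstruction for illustration only; the folder name and the iteration over os.listdir are assumptions, not taken from the diff):

    import os
    import PIL
    from PIL import Image

    def faulty_images(folder='images'):        # folder name is an assumption
        # delete any file Pillow cannot identify so Keras generators don't crash mid-epoch
        for fname in os.listdir(folder):
            img_p = os.path.join(folder, fname)
            try:
                img = PIL.Image.open(img_p)
            except PIL.UnidentifiedImageError:
                os.remove(img_p)
    #            print(img_p + "Removed")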
@@ -1,7 +0,0 @@
'''
Initial download and write of raw data from ebay
'''
import ebay_api

shopping = ebay_api.ShoppingApi()
data = shopping.conky()
13  update_dataset.py  Normal file
@@ -0,0 +1,13 @@
'''
Update dataset; instantiates FindingApi and makes call to eBay's Finding Api
using the findItemsByCategory service. Updates the master_ids list and raw_data.
'''
import ebay_api

# Make call to ebay Finding service and return list of twenty_id strings
finding = ebay_api.FindingApi(4) # 4 is URL parameter for used items
twenty_id_list = finding.get_ids_from_cats()[0]

# renew oauth token and make call to shopping service to get item data and write to local file
shopping = ebay_api.ShoppingApi()
data = shopping.conky(twenty_id_list)