# ebay-ml-lister

import os
from time import sleep
from random import randint
import scrape_ids
from datetime import datetime, timedelta
import dateutil
from dateutil import parser
import pytz
import pdb
from io import StringIO
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
import config as cfg
import shutil
import re
import urllib, base64

from ebaysdk.exception import ConnectionError
from ebaysdk.trading import Connection as Trading
from ebaysdk.finding import Connection as Finding
from ebaysdk.shopping import Connection as Shopping

# renew oauth token for shopping api
def getAuthToken():
    """Request an application OAuth token from eBay and cache it on disk.

    Uses the client-credentials grant with the client id, client secret and
    RuName found in ``cfg.oauth``. The token is written (JSON-encoded) to
    ``temp_oauth_token.txt`` in the working directory, where the ShoppingApi
    methods read it back.

    Returns:
        The access token string.

    Raises:
        KeyError: the response carries no ``access_token``; the message
            includes eBay's ``error_description`` when one is present.
        requests.exceptions.RequestException: the HTTP request itself failed
            or timed out.
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is
    # loaded, so import it explicitly before using it.
    import urllib.parse

    AppSettings = {
        'client_id': cfg.oauth["client_id"],
        'client_secret': cfg.oauth["client_secret"],
        'ruName': cfg.oauth["RuName"],
    }

    # HTTP Basic credentials: base64("client_id:client_secret") as text.
    authHeaderData = AppSettings['client_id'] + ':' + AppSettings['client_secret']
    encodedAuthHeader = base64.b64encode(authHeaderData.encode()).decode('ascii')

    headers = {
        # Tells the server that the request body is URL-encoded form data.
        "Content-Type": "application/x-www-form-urlencoded",
        "Authorization": "Basic " + encodedAuthHeader,
    }

    body = {
        "grant_type": "client_credentials",
        "redirect_uri": AppSettings['ruName'],
        "scope": "https://api.ebay.com/oauth/api_scope",
    }

    data = urllib.parse.urlencode(body)

    tokenURL = "https://api.ebay.com/identity/v1/oauth2/token"

    # The timeout keeps a stalled connection from hanging the whole run.
    response = requests.post(tokenURL, headers=headers, data=data, timeout=24).json()

    if 'access_token' not in response:
        # Same exception type a bare lookup would raise, but with the reason
        # reported by the server attached.
        raise KeyError(
            'access_token missing from OAuth response: '
            + str(response.get('error_description', response)))
    access_token = response['access_token']

    with open('temp_oauth_token.txt', 'w') as f:
        json.dump(access_token, f)

    return access_token

class FindingApi:
    '''
    Methods for accessing eBay's FindingApi services
    '''

    def __init__(self, service):
        '''
        Selects the Finding API operation by index.

        service -- int index into the operation list below. The rest of the
        script currently uses only index 4 (findItemsByCategory). An index
        outside the list raises IndexError.
        '''
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
            ][service] # Currently using only index 4, i.e., service = 4

    def get_data(self, category_id):

        '''
        Calls the selected FindingApi operation for one category and returns
        the itemIds found in the first page of results (100 entries, newest
        first, used condition, duplicates hidden).

        Returns a list of unique itemId strings in response order. Returns an
        empty list on a connection/HTTP error, and whatever was collected so
        far when the response does not have the expected layout.
        '''
#        startTime = dateutil.parser.isoparse( startTime )
#        now = datetime.datetime.now(tz=pytz.UTC)
#        days_on_site = (now - startTime).days # as int

        ids = []
        params = {
            "OPERATION-NAME":self.service,
            "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
            "SERVICE-VERSION":"1.13.0",
            "RESPONSE-DATA-FORMAT":"JSON",
            "categoryId":category_id,
            "paginationInput.entriesPerPage":"100",
            "paginationInput.pageNumber":"1",
            # Multiple item filters must each carry their own index.
            "itemFilter(0).name":"Condition",
            "itemFilter(0).value":"Used",
            "itemFilter(1).name":"HideDuplicateItems",
            "itemFilter(1).value":"true",
            "sortOrder":"StartTimeNewest",
            }
#            "itemFilter(2).name":"TopRatedSellerOnly", # TODO fix here
#            "itemFilter(2).value":"true"

        try:
            response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                params=params, timeout=24)
            response.raise_for_status()

        except requests.exceptions.RequestException: # TODO need to be able to continue where you left off or use a better timeout
            print('connection error')
            return ids

        # The top-level key of the JSON payload is "<operation>Response", so
        # derive it from the selected operation instead of hard-coding it.
        response_key = self.service + 'Response'
        try:
            data = response.json()
            for item in data[response_key][0]['searchResult'][0]['item']:
                ids.append(item['itemId'][0])

        except (AttributeError, KeyError, IndexError, TypeError, ValueError):
            # ValueError covers a body that is not JSON at all; printing the
            # raw text avoids raising a second time inside the handler.
            print('AttributeError or KeyError. Exiting')
            print(response.text)

        # De-duplicate while keeping the newest-first order of the response.
        return list(dict.fromkeys(ids))

    # TODO add some other options to the Finding API call, such as filtering for used items only. This might give
    # you a better dataset for training. Or maybe a mixture of new and used. Maybe try to come up with a way to
    # mathematically determine your odds of maximizing the number of pictures in your training set while reducing
    # the number of useless images. Say, for example, if you took a random set of 3 of 8 pictures total from each
    # listing you might have a better chance of getting 3 good pictures in addition to increasing your training
    # set. Or maybe you would have better luck limiting it to the first 5 pictures instead of random.

    # You may even have more consistency with used shoes since they are "one-off" items without confusing multiple
    # variations and colors. You could also run small training sets on both new and used to see which one is more
    # accurate, or whether a combination of both is more accurate.

    def get_ids_from_cats(self):
        '''
        Fetches itemIds for every category in cat_list.txt, merges them into
        master_ids.txt, and batches them for the ShoppingApi call.

        Returns a tuple (twenty_id_list, ids): twenty_id_list is a list of
        comma-joined strings of up to 20 itemIds each; ids is the flat list of
        unique itemIds gathered by this call.

        cat_list.txt must exist; master_ids.txt is created when it is missing
        or unreadable.
        '''

        ids = []

        # load category id list
        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        # load list of master ids; start fresh on the first run
        try:
            with open('master_ids.txt') as f:
                master_ids = json.load(f)
        except (FileNotFoundError, ValueError):
            master_ids = []

        # fetch ids with calls to Finding Api given cats as param
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for cat_ids in executor.map(self.get_data, cat_list):
                ids.extend(cat_ids)

        # the same listing can come back for more than one category; sending
        # it twice would waste slots in the 20-id batches
        ids = list(dict.fromkeys(ids))

        # append master ids list with temporary ids from single function call and save
        master_ids = list(dict.fromkeys(list(master_ids) + ids))
        with open('master_ids.txt', 'w') as f:
            json.dump(master_ids, f)

        # 20-ItemID list created to maximize dataset/decrease calls provided call constraints
        twenty_id_list = [','.join(ids[n:n+20]) for n in range(0, len(ids), 20)]

        return twenty_id_list, ids

class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes
    '''

    # TODO (disabled): renew the oauth token on construction, e.g.
    #    def __init__(self):
    #        self.access_token = getAuthToken()

    def update_cats(self):
        '''
        Updates cat_list.txt, women_cat_list.txt and men_cat_list.txt with the
        child category ids of the two parent shoe departments.

        Reads the oauth token from temp_oauth_token.txt. If any request fails
        or a response lacks the expected layout, prints a message and returns
        without touching the existing files.
        '''

        # (parent category id, file that receives its child category ids)
        parent_cats = [('3034', 'women_cat_list.txt'),   # Women's shoe department
                       ('93427', 'men_cat_list.txt')]    # Men's shoe department
        cat_list = []
        cats_by_file = {}

        with open('temp_oauth_token.txt') as f:
            access_token = json.load(f)

        headers = {
            "X-EBAY-API-IAF-TOKEN":access_token,
            "version":"671",
            }

        for department, file_name in parent_cats:

            url = "https://open.api.ebay.com/shopping?&callname=GetCategoryInfo&responseencoding=JSON&IncludeSelector=ChildCategories&CategoryID="+department

            try:
                response = requests.get(url, headers=headers, timeout=4)
                response.raise_for_status()
                categories = response.json()['CategoryArray']['Category']
                # index 0 is the parent node, i.e., women's or men's dept.
                temp_cat_list = [cat['CategoryID'] for cat in categories[1:]]

            except (requests.exceptions.RequestException, KeyError, TypeError, ValueError):
                # Bail out instead of overwriting the lists with partial data.
                print('connection error')
                return

            cats_by_file[file_name] = temp_cat_list
            cat_list.extend(temp_cat_list)

        with open('cat_list.txt', 'w') as f:
            json.dump(cat_list, f)
        for file_name, cats in cats_by_file.items():
            with open(file_name, 'w') as f:
                json.dump(cats, f)

    def get_item_from_findItemsByCategory(self, twenty_id):

        '''
        Gets raw JSON data from multiple live listings given multiple itemIds

        twenty_id -- comma-joined string of itemIds (batches of up to 20 are
        produced by FindingApi.get_ids_from_cats).

        Returns the value of the response's 'Item' key, or None when the
        request fails or the response has no 'Item' key.
        '''
        with open('temp_oauth_token.txt') as f:
            access_token = json.load(f)

        headers = {
            "X-EBAY-API-IAF-TOKEN":access_token,
            "version":"671",
            }

        url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID="+twenty_id

        # Most detailed thing known about the exchange so far; printed on
        # failure. Stays None when the request never produced a response.
        details = None
        try:
            response = requests.get(url, headers=headers,timeout=24)
            details = response
            response.raise_for_status()
            details = response.json()
            item = details['Item']

        except (requests.exceptions.RequestException, KeyError, TypeError, ValueError):
            print('connection error. IP limit possibly exceeded')
            print(details)
            return # returns NoneType. Handled at conky()

        return item

    def conky(self, twenty_ids_list):
        '''
        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
        data for creating training sets

        Appends the fetched listings to the ones already stored in
        raw_data.txt, rewrites that file, and returns the combined list.
        Stops consuming results at the first failed batch.
        '''
        try:
            with open('raw_data.txt') as f:
                data = json.load(f)
        except (FileNotFoundError, ValueError):
            data = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for items in executor.map(self.get_item_from_findItemsByCategory, twenty_ids_list):
                if items is None:
                    print('response is None')
                    break
                # The end result should be a list of dicts where each dict in the list is a listing
                data.extend(items)

        with open('raw_data.txt', 'w') as f:
            json.dump(data, f)
        return data

# NOTE:

# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
# per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
# to divide these up among the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.


class CurateData:
    '''
    Contains methods for curating data for machine learning training sets;
    Takes item in data from ShoppingApi request as argument and extracts/ creates key
    value pairs that gets updated to custom dataframe used in Ml training sets.
    '''

    def import_raw(self):
        '''
        imports raw response json from local file. This is data from
        GetMultipleItems call in ShoppingApi
        '''
        with open('raw_data.txt') as f:
            return json.load(f)

    def raw_df(self, raw_data):
        '''
        creates pandas df from raw json and saves master raw csv file as raw_df.csv.
        Intended to be used inline with direct
        data stream from ebay's APIs

        Listings that share an ItemID are collapsed to their first occurrence
        before the file is written.
        '''
        to_json = json.dumps(raw_data)
        raw_df = pd.read_json(StringIO(to_json))

        # raw_data accumulates across runs, so the same listing can appear
        # more than once.
        if 'ItemID' in raw_df.columns:
            raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)

        raw_df.to_csv('raw_df.csv') # NOTE not append mode because raw_df is made from the master raw_data.txt file

        return raw_df

    def to_training(self, raw_data):
        '''
        creates first pass of potential labels for training set. This is the base
        df used to produce other training sets to use.

        Keeps the id, picture, category, title and item-specifics columns,
        stores ItemID and PrimaryCategoryID as strings, and drops listings
        without ItemSpecifics.
        '''
        raw_df = self.raw_df(raw_data)
        keep = ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']
        interm_df1 = raw_df.loc[:, keep].copy()
        interm_df1 = interm_df1.astype({'ItemID': str, 'PrimaryCategoryID': str})
        training = interm_df1.dropna(subset=['ItemSpecifics'])
        return training # TODO RENAME THIS FUNC AND its RETURN VALUE

    def class_training(self, training):
        '''Training set for multiclass portion of training set. Used to train
        separately from multilabel portion
        '''
        return training.loc[:, ['PictureURL', 'PrimaryCategoryID']]

    def nvl_training(self, training):
        '''
        Training set for multilabel portion

        Turns each listing's ItemSpecifics['NameValueList'] into one row of
        {Name: Value} columns and puts the listing's PictureURL in front. The
        result has a fresh 0..n-1 index.
        '''
        records = []
        for specifics in training['ItemSpecifics']:
            pairs = specifics['NameValueList']
            # assumes a lone name/value pair may arrive as a bare dict rather
            # than a one-element list -- TODO confirm against real responses
            if isinstance(pairs, dict):
                pairs = [pairs]
            records.append({pair['Name']: pair['Value'] for pair in pairs})

        nvl_df = pd.json_normalize(records)

        # `training` keeps the gaps left by dropna(); both sides need the
        # same positional index or concat() misaligns pictures and labels.
        pictures = training['PictureURL'].reset_index(drop=True)
        nvl_training = pd.concat([pictures, nvl_df], axis=1)

        return nvl_training

    def extract_df(self, df):
        '''
        converts single-value lists of strings of any df to string if not null

        List cells are space-joined into one string, null scalars become NaN,
        every other cell is returned unchanged.
        '''
        def flatten(cell):
            if isinstance(cell, list):
                return ' '.join(str(part) for part in cell)
            if pd.api.types.is_scalar(cell) and pd.isnull(cell):
                return np.nan
            return cell

        # DataFrame.applymap was renamed to DataFrame.map in newer pandas;
        # use whichever this installation provides.
        if hasattr(pd.DataFrame, 'map'):
            return pd.DataFrame.map(df, flatten)
        return pd.DataFrame.applymap(df, flatten)

    def drop_nvl_cols(self, nvl_training):
        '''
        Optionally trims nvl_training to PictureURL plus the item-specific
        columns listed in cat_spacs.txt, minus the names excluded below.

        Prompts on stdin; an answer containing "n" returns nvl_training
        unchanged.
        '''

        with open('cat_spacs.txt') as f:
            cat_spacs = json.load(f)

        drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line',
                'Personalize', 'Fabric Type', 'Customized','Release Year',
                'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade',
                'Signed', 'Silhouette', 'Insole Material', 'Lining Material',
                'California Prop 65 Warning', 'Character Family', 'Character',
                'Cushioning Level', 'Personalization Instructions', 'Pronation',
                ]
        drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features',
                'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size',
                'Country/Region of Manufacture', 'Brand', 'Model']
        unwanted = set(drop) | set(drop_2)
        cat_spacs = [spec for spec in cat_spacs if spec not in unwanted]

        user_input = input('drop cols? (y,n; default=y): ')

        if 'n' in user_input.lower():
            return nvl_training

        # keep only the wanted specifics that actually occur in this df
        present = set(nvl_training.columns)
        cols = ['PictureURL'] + [col for col in cat_spacs if col in present]
        return nvl_training[cols]

    # For future reference: to deal with inconsistent values in the nvl (due to sellers inputting custom values
    # in the fields) you can drop either listings or k/v pairs that are unique, which can be determined by
    # computing the frequency of k/v pairs --> list of unique k/v pairs --> drop those that occur once.

    # Check the list of cols to keep for duplicates with different spelling and phrasing
    # (e.g., Departement and Department, or Fastening and Closure Type)

    @staticmethod
    def _explode_pictures(df):
        '''One row per picture URL: explode, drop empty URLs, drop repeats.'''
        exploded = df.explode('PictureURL').reset_index(drop=True)
        exploded = exploded.dropna(subset=['PictureURL'])
        return exploded.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

    @staticmethod
    def _first_picture(cell):
        '''First URL of a listing's picture list, or NaN when there is none.'''
        if isinstance(cell, str):
            return cell
        if isinstance(cell, (list, tuple)) and len(cell) > 0:
            return cell[0]
        return np.nan

    def expand_nvlclass(self, class_training, dropd):
        '''
        takes image url list from each cell and expands them into separate/duplicate
        instances, or keeps only the primary (first) image of each listing,
        depending on the answer given on stdin.

        Merges the resulting picture URLs into temp_pics_source_list.txt and
        saves both frames as expanded_class.csv / expanded_dropd.csv.

        Returns (expanded_class, expanded_dropd). The input frames are not
        modified.
        * consider applying this function to other cells that have multiple values in their lists
        '''
        expand = input("expand image list or use primary listing image? (y or n): ")
        if 'y' in expand.lower():
            expanded_class = self._explode_pictures(class_training)
            expanded_dropd = self._explode_pictures(dropd)

        else:
            expanded_class = class_training.copy()
            expanded_class['PictureURL'] = expanded_class['PictureURL'].apply(self._first_picture)
            expanded_class = expanded_class.dropna()

            expanded_dropd = dropd.dropna(subset=['PictureURL']).copy()
            expanded_dropd['PictureURL'] = expanded_dropd['PictureURL'].apply(self._first_picture)
            expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])

        expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

        temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

        # merge with the URLs collected by earlier runs; the file is absent
        # the first time the script is run
        try:
            with open('temp_pics_source_list.txt') as f:
                tpsl = json.load(f)
        except (ValueError, FileNotFoundError):
            tpsl = []

        # ensures no duplicate source URLs exist
        temp_pics_source_list = list(set(list(tpsl) + temp_pics_source_list))
        with open('temp_pics_source_list.txt', 'w') as f:
            json.dump(temp_pics_source_list, f)

        expanded_class.to_csv('expanded_class.csv')
        expanded_dropd.to_csv('expanded_dropd.csv')

        return expanded_class, expanded_dropd

    def dl_pic(self,dict_pics, pic):
        '''
        Downloads one picture to the path dict_pics maps it to.

        dict_pics -- {source URL: target file path}
        pic -- source URL

        Returns True when the target file exists afterwards (downloaded now or
        already present), False when pic has no entry in dict_pics, and None
        when the download failed.
        '''
        try:
            target = dict_pics[pic]
        except KeyError:
            return False

        # check if image exists at the target path. avoids dupes
        if os.path.exists(target):
            return True

        try:
            r = requests.get(pic, stream=True, timeout=24)
            try:
                r.raise_for_status()
                with open(target, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=65536):
                        f.write(chunk)
            finally:
                r.close()

        except requests.exceptions.RequestException:
            # a half-written file would otherwise pass the exists() check on
            # the next run and never be re-downloaded
            if os.path.exists(target):
                os.remove(target)
            return None

        return True

    def dict_pics(self):
        '''
        Builds the {source URL: target file path} dict for every URL in
        temp_pics_source_list.txt and saves it as dict_pics.txt.

        The target folder is read from target_dirs.txt; when that file is
        missing the user is asked for a folder, defaulting to
        ./training_images. Returns the dict, or None when the URL list cannot
        be read.
        '''

        try:
            with open('target_dirs.txt') as f: # TODO you can add option to change directory here, too
                target_dir = json.load(f)

        except (ValueError, FileNotFoundError):
            answer = input('No target dirctory found. Create One? [y] or [n]:')
            if answer.strip().lower() == 'y':
                target_dir = input('Please provide full URL to destination folder:') # TODO need to catch human syntax errors here
            else:
                target_dir = os.getcwd()+os.sep+'training_images'
                print('Creating default folder in current directory @ ' + target_dir)

            os.makedirs(target_dir, exist_ok=True)
            with open('target_dirs.txt','w') as f:
                json.dump(target_dir, f)

        # open url list in working directory
        try:
            with open('temp_pics_source_list.txt') as f:
                temp_pics_source_list = json.load(f)

        except (ValueError, FileNotFoundError):
            print('url list not found. aborting')
            return

        # NOTE(review): the unescaped "." in the second alternative consumes
        # one character of the file stem -- confirm that this is intended.
        name_patt = re.compile(r'[^/]+(?=/\$_|.(\.jpg|\.jpeg|\.png))', re.IGNORECASE)
        ext_patt = re.compile(r'(\.jpg|\.jpeg|\.png)', re.IGNORECASE)

        dict_pics = {}

        # make custom dict, {source:target}, and name images from unique URL patt
        for k in temp_pics_source_list:
            patt_1 = name_patt.search(k)
            patt_2 = ext_patt.search(k)
            if patt_1 is not None and patt_2 is not None:
                tag = patt_1.group() + patt_2.group().lower()
                dict_pics[k] = target_dir + os.sep + tag

        with open('dict_pics.txt', 'w') as f:
            json.dump(dict_pics, f)

        return dict_pics # TODO still need to find sol to outliers (aka, naming scheme for unusual source URLs)

    def dl_pictures(self, *dict_pics):
        '''
        Downloads pictures from api to local storage using temp_pics_source_list
        and dict_pics

        dict_pics -- optional {source URL: target path} dict as the first
        positional argument; built with self.dict_pics() when omitted.
        '''

        # *dict_pics arrives as a tuple; the mapping itself is its first item
        mapping = dict_pics[0] if dict_pics else self.dict_pics()
        if mapping is None:
            print('url list not found. download aborted')
            return

        try:
            with open('temp_pics_source_list.txt') as f:
                temp_pics_source_list = json.load(f)
        except (ValueError, FileNotFoundError):
            print('url list not found. download aborted')
            return

        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = executor.map(lambda pic: self.dl_pic(mapping, pic), temp_pics_source_list)
            for result in results:
                # dl_pic returns None only when the download itself failed
                if result is None:
                    print('connection error')

class PreProcessing:
    '''
    Includes methods for pre-processing training set input and labels in the
    training set created from CurateData class. Whereas CurateData training
    sets provided trimmed down data from the raw json response from the
    ShoppingApi call and provided a bare minimum format for the dataframe to be
    used in training, PreProcessing optimizes that dataframe for training and
    includes methods for image manipulation, creating test/train/validation
    splits, etc.
    '''

    def dict_pics(self):
        '''
        Source to target training. Maps every source image URL in
        temp_pics_source_list.txt to a local ".jpg" path in the current
        working directory, named after the URL segment matched by the pattern
        below.

        URLs the pattern does not match are skipped. Returns the
        {source: target} dict.
        '''

        target_dir = os.getcwd()
        with open('temp_pics_source_list.txt') as f:
            temp_pics_source_list = json.load(f)

        name_patt = re.compile(r'[^/]+(?=/\$_|.jpg)', re.IGNORECASE)

        dict_pics = {}
        for k in temp_pics_source_list:
            match = name_patt.search(k)
            if match is None:
                # no usable name in this URL; calling .group() on the failed
                # match would abort the whole run
                continue
            dict_pics[k] = target_dir + os.sep + match.group() + '.jpg'

        # NOTE(review): the message names the training_images folder while the
        # targets above are placed directly in the working directory -- confirm
        # which one is intended.
        print("{source:target} dictionary created @ " + os.getcwd() + os.sep + 'training_images')
        return dict_pics

    # TODO pipeline gameplan: 5 files: dict_pics.txt, raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv
    # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
    # if not exists and append to master img download dict
    # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted

def main():
    '''
    Entry point of the script: meant to create/update the csv file used for
    ML training from live ebay listings. Currently a placeholder that does
    nothing.
    '''
    return None
# main goes here:

if __name__ == "__main__":
    main()

'''
Based on your sample set of 10 images, if you have an average of 5 images per
listing and you download a hundred listings, you will have about 102 Gb of
image data. That's just for one day. If you have more than a million listings
you're looking at a little over 1Tb of image data. You don't even know if this
is good data yet.
'''