import importlib
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
class FindingApi:
    '''Methods for accessing eBay's FindingApi services'''

    def __init__(self, service, pageNumber):
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
        ][service]
        # 64 pages is recommended as this will give equal weights to cats
        # given call constraints
        self.pageNumber = list(range(1, pageNumber))

        # departments = ["3034","93427"] (womens and mens)

    def get_data(self, category_id, i):
        '''
        Gets raw JSON data from a FindingApi service call.
        Currently being used to get itemIDs from categories.
        '''
        params = {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": "scottbea-xlister-PRD-6796e0ff6-14862949",
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "100",
            "paginationInput.PageNumber": i
        }

        response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                params=params)
        data = response.json()
        return data

    def get_ids_from_cats(self):
        '''
        Creates a 20-itemId list to use for the ShoppingApi call.
        '''
        pages = self.pageNumber
        itemid_results_list = []

        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        for category_id in cat_list:
            args = [(category_id, i) for i in pages]
            with concurrent.futures.ThreadPoolExecutor() as executor:
                for future in executor.map(lambda p: self.get_data(*p), args):
                    data = future
                    try:
                        training = pd.read_csv('training.csv')
                        for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                            # Compare on the itemId string itself; comparing the raw
                            # item dict against lists of id strings never matches.
                            if (item['itemId'][0] not in training.values) and (item['itemId'][0] not in itemid_results_list):
                                itemid_results_list.append(item['itemId'][0])

                    except (pd.errors.EmptyDataError, FileNotFoundError):
                        for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                            if item['itemId'][0] not in itemid_results_list:
                                itemid_results_list.append(item['itemId'][0])

        # Deduplicate first, then chunk the deduplicated ids into comma-separated
        # strings of 20 (the GetMultipleItems maximum per call).
        item_id_results = list(set(itemid_results_list))
        item_id_results = [','.join(item_id_results[n:n + 20])
                           for n in range(0, len(item_id_results), 20)]

        return item_id_results

# TODO instead of running through multiple try/except loops, try to implement set
# methods for efficiency and ease. Remember symmetric_difference, difference,
# intersection, set(). A minimal sketch follows below.
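
# A minimal sketch of the set-based filtering the TODO above suggests; the helper
# and its arguments are hypothetical, not part of the existing classes.
def new_ids_only(fetched_ids, known_ids):
    '''Returns the fetched ids not already present in known_ids, deduplicated.'''
    return list(set(fetched_ids).difference(known_ids))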


class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes.
    '''

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds.
        '''
        params = {
            "callname": "GetMultipleItems",
            "appid": "scottbea-xlister-PRD-6796e0ff6-14862949",
            "version": "671",
            "responseencoding": "JSON",
            "ItemID": twenty_id,
            "IncludeSelector": "ItemSpecifics",
        }
        response = requests.get("https://open.api.ebay.com/shopping", params=params)
        response = response.json()
        response = response['Item']
        return response

    def conky(self):
        '''
        For some reason item_id_results can only be passed as an argument in
        executor.map if the variable is made within the function.
        '''
        data = []  # TODO consider appending a list of dictionaries rather than updating a dictionary of dictionaries. The training var will require an updated dictionary though
        finding = FindingApi(4, 2)
        item_id_results = finding.get_ids_from_cats()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
                for item in future:
                    data.append(item)  # The end result should be a list of dicts where each dict in the list is a listing
        # TODO each future is a list of dictionaries, because the output of any multithreader
        # in this method is a list. A data dictionary can't update from a list of dicts unless
        # iterated over; might need a different way to update.
        return data

# TODO It seems like the problem with updating the dictionary/csv file may start here;
# the item data may be getting appended out of order relative to the item itself.


class CurateData:
    '''
    Contains functions for curating data for machine learning training sets;
    takes item data from the ShoppingApi request as an argument and extracts/creates
    key-value pairs that get updated to a custom dataframe used in ML training sets.
    '''

    def import_raw(self):
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
        return raw_data

    def data_frame(self, data):
        to_json = json.dumps(data)
        raw_df = pd.read_json(to_json)
        return raw_df

    def to_training(self, data):
        raw_df = self.data_frame(data)
        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
        training = interm_df1
        return training

    def nvl_dict(self, training):
        interm_df1 = pd.Series(training.ItemSpecifics)
        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
        # Builds one {Name: Value} dict per listing from its NameValueList entries.
        nvl_dict = interm_df1.apply(lambda x: {nvl['Name']: nvl['Value'] for nvl in x})
        return nvl_dict

    def update_df(self, data):
        '''
        Creates training instances for the dataset. picture_url_list is expanded
        to the max available pictures, with each picture url corresponding to the
        features it shares with the same listing (i.e., because there are multiple
        pictures per listing, each picture will be its own training instance).
        A hedged sketch follows the notes below.
        '''
        # USE a combination of apply() and a dict comprehension to extract your custom nvl_dict from the nvl in each cell
        # USE training.apply(func, axis=something) to create your custom nvl_dict for each cell
        # USE raw_df.loc[:, ['col1', 'col2', 'col3', 'etc']] for creating a new df. There may be another way though.
        # USE pd.merge() at some point...possibly after expanding lists and nvl
        # USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can
        # transform each new raw_df first before combining it with the previous transformed df. Then you can
        # take the raw_df and combine it with the old raw_df for backup.

        # TODO You will have to mess around more with the pandas df to find a better solution for creating your
        # csv file: i.e., create a dataframe from instances, then run it through a process to customize the df
        # for the final training set for your ml model training. Contemplate the future... you want the ability
        # to update both the main csv AND the training csv; one for updating raw data instances from search
        # queries, and the other for updating your training set.
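        # A minimal sketch of the expansion described in the docstring, not a final
        # implementation: it assumes PictureURL holds a list of urls per row, so
        # explode() (pandas >= 0.25) yields one training instance per picture.
        training = self.to_training(data)
        return training.explode('PictureURL').reset_index(drop=True)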


def main():
    '''
    Main program creates/updates a csv file to use for ML training from live
    eBay listings.
    '''
    pass
    # main goes here:
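    # A hedged sketch of the intended flow, left commented out because it would
    # make live api calls; the names follow the classes defined above:
    # shopping = ShoppingApi()
    # data = shopping.conky()
    # curate = CurateData()
    # training = curate.to_training(data)
    # training.to_csv('training.csv', mode='a', index=False)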


if __name__ == "__main__":
    main()

# Limited to 5000 calls to the Shopping api per day, and the GetMultipleItems service maxes out at
# 20 items per call, leaving you 100,000 items per day (5,000 calls x 20 items) for your pandas
# dataframe initially. So you'll have to divide these up into the categories. Spread over 16
# categories, this will leave you with about 6.25K results per cat. More than enough data for your dataset.

# For future reference: to deal with inconsistent values in the nvl (due to sellers inputting custom
# values in the fields), you can drop either listings or k/v pairs that are unique. These can be found
# by applying a function to determine the frequency of k/v pairs --> list of unique k/v pairs -->
# function to determine the frequency of unique k/v pairs --> drop those with a count of 1.
# A rough sketch of this filter follows below.
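
# A rough sketch of the frequency filter described above; assumes nvl_dicts is an
# iterable of the per-listing dicts produced by CurateData.nvl_dict.
from collections import Counter

def drop_rare_pairs(nvl_dicts):
    '''Drops k/v pairs that appear in only one listing across the dataset.'''
    # Values from the api come back as lists, so convert them to tuples for hashing.
    as_key = lambda k, v: (k, tuple(v) if isinstance(v, list) else v)
    counts = Counter(as_key(k, v) for d in nvl_dicts for k, v in d.items())
    return [{k: v for k, v in d.items() if counts[as_key(k, v)] > 1}
            for d in nvl_dicts]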

# TODO NEED TO ADD TRY/EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
# TO AVOID HICCUPS WHEN CREATING THE DATASET. A sketch of one guarded call follows below.
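
# A hedged sketch of the guarded call the TODO above asks for; the helper name,
# retry count, and timeout are assumptions, not part of the existing code.
def safe_get(url, params, retries=3):
    '''Retries a GET request a few times before giving up, to smooth over hiccups.'''
    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError):
            if attempt == retries - 1:
                raise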

# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS, EITHER USING YOUR OWN TAGS OR
# SOMEHOW FINDING TAGS ON OTHERS' LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding
# a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it
# may actually be a good thing to include all the pictures in a training set, but then when you're ready
# to begin training you'll have a data cleaning pipeline specific to training a model to either learn
# shoe features or information on tags.