2020-11-09 01:47:03 +00:00
import concurrent . futures
2020-10-12 07:53:29 +00:00
import json
import requests
import pandas as pd
class FindingApi:
    '''Methods for accessing eBay's Finding API services.'''

    def __init__(self, service, pageNumber):
        # service: index into the list of supported Finding API operations.
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores',
            'findItemsByCategory', 'findItemsByProduct'
        ][service]
        # Pages 1 .. pageNumber-1 are fetched; 64 pages is recommended.
        self.pageNumber = list(range(1, pageNumber))

    # this will give equal weights to cats given call constraints
    # departments = ["3034","93427"] (womens and mens)

    def get_data(self):
        '''
        Runs one Finding API call per (category, page) pair and returns the
        list of raw JSON responses.

        NOTE(review): the original returned only the LAST response — every
        earlier call's data was silently discarded.  Failed HTTP calls are
        now skipped (best effort) instead of crashing the whole harvest.
        '''
        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        pages = []
        for category_id in cat_list:
            for i in self.pageNumber:
                params = {
                    "OPERATION-NAME": self.service,
                    "SECURITY-APPNAME": "scottbea-xlister-PRD-6796e0ff6-14862949",
                    "SERVICE-VERSION": "1.13.0",
                    "RESPONSE-DATA-FORMAT": "JSON",
                    "categoryId": category_id,
                    "paginationInput.entriesPerPage": "100",
                    "paginationInput.PageNumber": i
                }
                try:
                    response = requests.get(
                        "https://svcs.ebay.com/services/search/FindingService/v1",
                        params=params)
                    response.raise_for_status()
                except requests.exceptions.RequestException:
                    continue  # skip this page; keep harvesting the rest
                pages.append(response.json())
        return pages

    @staticmethod
    def _batch_ids(itemid_results_list, size=20):
        '''Joins itemIds into comma-separated strings of at most `size` ids
        (the Shopping API's GetMultipleItems accepts up to 20 per call).'''
        return [','.join(itemid_results_list[n:n + size])
                for n in range(0, len(itemid_results_list), size)]

    def get_ids_from_cats(self):
        '''
        Creates a list of comma-separated 20-itemId strings to use for the
        ShoppingApi call.  Ids already present in training.csv (if it exists)
        are skipped, as are duplicates within this run.
        '''
        try:
            training = pd.read_csv('training.csv')
        except (pd.errors.EmptyDataError, FileNotFoundError):
            training = None

        itemid_results_list = []
        seen = set()  # O(1) dedupe; the original compared a dict against a
                      # list of id strings, which never matched
        for data in self.get_data():
            try:
                items = data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']
            except (KeyError, IndexError):
                continue  # malformed / empty response
            for item in items:
                # itemId comes wrapped in a list (variation listings) —
                # hence the [0]; TODO confirm against the API response shape
                item_id = item['itemId'][0]
                if item_id in seen:
                    continue
                # NOTE(review): membership test of an id against the whole
                # DataFrame's values, as in the original — verify this is the
                # intended cross-reference
                if training is not None and item_id in training.values:
                    continue
                seen.add(item_id)
                itemid_results_list.append(item_id)

        return self._batch_ids(itemid_results_list)
2020-10-18 07:08:04 +00:00
class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes.
    '''

    def get_item_from_findItemsByCategory(self, item_id_results):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds.

        item_id_results: iterable of comma-separated 20-id strings
        (from FindingApi.get_ids_from_cats).

        Returns the response dict with ALL batches' listings accumulated
        under the 'item' key (the key CurateData.update_df reads).
        NOTE(review): the original returned only the last batch's response,
        discarding every earlier batch.
        '''
        data = {}
        all_items = []
        for twenty_id in item_id_results:
            params = {
                "callname": "GetMultipleItems",
                "appid": "scottbea-xlister-PRD-6796e0ff6-14862949",
                "version": "671",
                "responseencoding": "JSON",
                "ItemID": twenty_id,
                "IncludeSelector": "ItemSpecifics",
            }
            try:
                response = requests.get("https://open.api.ebay.com/shopping?",
                                        params=params)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                continue  # skip failed batches; keep the rest
            data = response.json()
            all_items.extend(data.get('item', []))
        data['item'] = all_items
        return data  # TODO save data as file??
2020-10-18 20:56:16 +00:00
2020-10-18 00:22:45 +00:00
class CurateData:
    '''
    Contains functions for curating data for machine learning training sets.
    Takes item data from a ShoppingApi request as argument and extracts /
    creates key-value pairs that get appended to the training csv used for
    ML training sets.
    '''

    def extract_itemId(self, item):
        '''Returns a one-entry dict holding the listing's ItemID.'''
        return {'ItemID': item['ItemID']}

    def extract_catId(self, item):
        '''Returns a one-entry dict holding the primary category id.'''
        return {'PrimaryCategoryID': item['PrimaryCategoryID']}

    def extract_prime_cat_name(self, item):
        '''Returns a one-entry dict holding the primary category name.'''
        return {'PrimaryCategoryName': item['PrimaryCategoryName']}

    def extract_picture_url(self, item):
        '''Only pulls the PictureURL list and does not create a dictionary.'''
        return item['PictureURL']

    def extract_nvl(self, item):
        '''Flattens the listing's name/value specifics into one dict.

        NOTE(review): keys are lowercase here ('itemspecifics',
        'namevaluelist') while the request asks for selector
        "ItemSpecifics" — confirm the casing of the response keys.
        '''
        nvl = item['itemspecifics']['namevaluelist']
        return {pair['name']: pair['value'] for pair in nvl}

    def update_df(self, data):
        '''
        Creates training instances for the dataset and appends them to
        training.csv.  The PictureURL list is expanded to all available
        pictures, each picture url becoming its own training instance that
        shares the listing's other features.

        NOTE(review): the original mutated one dict across the loops and
        normalized/wrote it once at the end, so only the last instance ever
        reached the csv despite the docstring's stated intent.
        '''
        records = []
        for item in data['item']:
            # features shared by every picture of this listing
            common = {}
            common.update(self.extract_itemId(item))
            common.update(self.extract_catId(item))
            common.update(self.extract_prime_cat_name(item))
            common.update(self.extract_nvl(item))
            for url in self.extract_picture_url(item):
                record = {'PictureURL': url}
                record.update(common)
                records.append(record)
        if records:
            df = pd.json_normalize(records)
            df.to_csv('training.csv', mode='a')
2020-10-12 07:53:29 +00:00
2020-10-18 00:22:45 +00:00
def main():
    '''
    Main program creates/updates a csv file to use for ML training from live
    ebay listings.
    '''
    service, pageNumber = input('service and pageNumber: ').split()
    finding = FindingApi(int(service), int(pageNumber))

    # NOTE(review): the original ThreadPoolExecutor usage was broken — it
    # called finding.get_ids_from_cats() eagerly and passed its *result*
    # (a list, not a callable) to executor.map, then called the method again
    # inside the loop.  Run sequentially until the fetch is refactored to
    # take a page argument that can be mapped across worker threads.
    item_id_results = finding.get_ids_from_cats()

    shopping = ShoppingApi()
    data = shopping.get_item_from_findItemsByCategory(item_id_results)

    curate = CurateData()
    curate.update_df(data)


if __name__ == "__main__":
    main()
2020-11-09 01:47:03 +00:00
2020-10-12 07:53:29 +00:00
# Limited to 5,000 calls to the Shopping API per day, and the GetMultipleItems
# service maxes out at 20 items per call, leaving you 100,000 items per day for
# your pandas dataframe initially. So you'll have to divide these up into the
# categories. This will leave you with about 6.25K results per category.
# More than enough data for your dataset.
2020-10-17 23:21:11 +00:00
# Need to make sure dataframe gets important stuff outside of nvl in order to
# access values for cross referencing itemIds from calls
# Need to decide if list gets accessed from df or if you're just going to have
# list contents extracted and possibly placed into separate cells/labels
2020-11-09 01:47:03 +00:00
# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
# TO AVOID HICCUPS WHEN CREATING DATASET