adding multithreading to first call in main

2020-11-08 18:47:03 -07:00 · 2020-11-08 18:47:03 -07:00 · 1134d3f155
commit 1134d3f155
parent 5965f19d2a
1 changed files with 72 additions and 42 deletions
--- a/ebay_api.py
+++ b/ebay_api.py
@ -1,9 +1,10 @@
+import concurrent.futures
 import json
 import requests
 import pandas as pd

 class FindingApi:
-    '''Some docstring to get rid of linting errors'''
+    '''Methods for accessing eBay's FindingApi services'''
    def __init__(self, service, pageNumber):
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
@ -14,8 +15,8 @@ class FindingApi:
        # this will give equal weights to cats given call constraints

    # departments = ["3034","93427"] (womens and mens)
-    def get_data(self):
-        '''# Gets raw JSON data fom FindingApi service call
+    def get_data(self): # TODO: Make this work with multithreading. Every other method in the following classes depends on the data variable generated here, so decide how that variable gets built up across threads. The simplest solution would be to append the data inside this method on every threaded call.
+        '''Gets raw JSON data from FindingApi service call
        '''
        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)
@ -33,22 +34,21 @@ class FindingApi:
                response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                        params=params)
                data = response.json()
-            return data
+            return data # TODO MAY HAVE TO RUN IN THREADPOOLEXECUTOR() IN MAIN() AND SAVE DATA TO FILE

    def get_ids_from_cats(self):
        '''
        Creates a 20-itemId list to use for the ShoppingApi
        call
        '''
-        data = self.get_data()
+        data = self.get_data() # TODO consider using different variable names to avoid confusion between FindingApi data and ShoppingApi data
        itemid_results_list = []

-        try:# TODO run pdb here to see how to extract itemId before update_df
+        try:
            training = pd.read_csv('training.csv')
            for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                if (item not in training.values) and (item not in itemid_results_list):
-                    itemid_results_list.append(item['itemId'][0]) # itemId
-                    # values are in lists for some reason
+                    itemid_results_list.append(item['itemId'][0]) # TODO: something funky going on here; confirm why the zeroth index is needed. itemIds from the FindingApi call are in lists (due to "variations" listings)

        except (pd.errors.EmptyDataError, FileNotFoundError):
            for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
@ -85,47 +85,69 @@ class ShoppingApi:

 class CurateData:
    '''
-    Contains functions for curating data for machine learning training sets
+    Contains functions for curating data for machine learning training sets.
+    Takes an item in the data from a ShoppingApi request as argument and extracts/creates
+    key-value pairs that get added to the custom dataframe used in ML training sets.
    '''
-    def extract_itemid(self, data):
-        for item in data['Item']:
-           item_id = ['ItemID']
-    def extract_prime_cat(self, data):
-        for item in data['Item']:
-            prime_cat = ['PrimaryCategory']
-    def extract_picture_url(self, data):
-        for item in data['Item']:
-            picture_url_list = ['PictureURL']
-    def extract_nvl(self, data):
-        for item in data['item']:
-            training = {}
-            names = []
-            values = []
-            nvl = item['itemspecifics']['namevaluelist']
-            for nvl_dict in nvl:
-                names.append(nvl_dict['name'])
-                values.append(nvl_dict['value'])
-            # todo also append itemid and value to the dictionary somewhere
+    def extract_itemId(self, item):
+        item_id = {'ItemID':item['ItemID']}
+        return item_id
+
+    def extract_catId(self, item):
+        catId = {'PrimaryCategoryID':item['PrimaryCategoryID']}
+        return catId
+
+    def extract_prime_cat_name(self, item):
+        prime_cat_name= {'PrimaryCategoryName':item['PrimaryCategoryName']}
+        return prime_cat_name
+
+    def extract_picture_url(self, item):
+        '''
+        Only pulls PictureURL list and does not
+        create dictionary
+        '''
+        picture_url_list = item['PictureURL']
+        return picture_url_list
+
+    def extract_nvl(self, item):
+        names = []
+        values = []
+        nvl = item['itemspecifics']['namevaluelist']
+        for nvl_dict in nvl:
+            names.append(nvl_dict['name'])
+            values.append(nvl_dict['value'])
+        nvl_dict = dict(zip(names, values))
+        return nvl_dict

-            nvl_dict = dict(zip(names, values))
    def update_df(self, data):
        '''
-        Extracts itemIds and name-value list , creates new dict and appends df
+        Creates training instances for the dataset. picture_url_list is expanded
+        to the max available pictures, with each picture url paired with the
+        features shared by the same listing (i.e., because there are multiple
+        pictures per listing, each picture will be its own training instance).
        '''
        for item in data['item']:
-            training = {}
-            names = []
-            values = []
-            nvl = item['itemspecifics']['namevaluelist']
-            for nvl_dict in nvl:
-                names.append(nvl_dict['name'])
-                values.append(nvl_dict['value'])
-            # todo also append itemid and value to the dictionary somewhere

-            nvl_dict = dict(zip(names, values))
-            training.update(nvl_dict) # todo just creating a training variable will not include itemid, picture urls, or categories which you will still need for your df so you can either extract them and append them to training or you can solely modify the data variable but you will have to deal with runtime error.
-        # probably best to extract and making custom df
-        df = pd.json_normalize(training)
+            training = {} # TODO something funky going on here
+            # NEED TO CREATE EMPTY DICT OUTSIDE OF FOR LOOP?
+            picture_url_list = self.extract_picture_url(item)
+
+            '''
+            Creates the same training instance for each photo of the listing
+            '''
+            for url in picture_url_list:
+                remote_url = {'PictureURL':url}
+                training.update(remote_url)
+                item_id = self.extract_itemId(item)
+                training.update(item_id)
+                catId = self.extract_catId(item)
+                training.update(catId)
+                prime_cat_name = self.extract_prime_cat_name(item)
+                training.update(prime_cat_name)
+                nvl_dict = self.extract_nvl(item)
+                training.update(nvl_dict)
+
+        df = pd.json_normalize(training) # TODO FIX INDENT HERE?
        df.to_csv('training.csv', mode='a')

 def main():
@ -137,6 +159,10 @@ def main():
    service = int(service)
    pageNumber = int(pageNumber)
    finding = FindingApi(service, pageNumber)
+    # TODO START MULTITHREADING HERE FOR FINDINGAPI CALL?
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        for future in executor.map(finding.get_ids_from_cats(), finding.pageNumber):
+
    item_id_results = finding.get_ids_from_cats()
    shopping = ShoppingApi()
    data = shopping.get_item_from_findItemsByCategory(item_id_results)
@ -145,6 +171,7 @@ def main():

 if __name__ == "__main__":
    main()
+
 # Limited to 5000 calls to shopping api per day, and getMultpileitems service maxes out at 20 items
 # per call leaving you 100,000 items per day for you pandas dataframe initially. So you'll have
 # to divide these up into the categories. This will leave you with about 6.25K results per cat.
@ -154,3 +181,6 @@ if __name__ == "__main__":
 # access values for cross referencing itemIds from calls
 # Need to decide if list gets accessed from df or if you're just going to have
 # list contents extracted and possibly placed into separate cells/labels
+
+# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
+# AVOID HICCUPS WHEN CREATING DATASET