From 35cdf8374fd7e7c174a51ab86b79f2539a35b959 Mon Sep 17 00:00:00 2001
From: spbeach46
Date: Tue, 11 May 2021 11:11:59 -0700
Subject: [PATCH] Add CSV file save and appending functionality where needed

---
 ebay_api.py | 132 ++++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 61 deletions(-)

diff --git a/ebay_api.py b/ebay_api.py
index 15665ae..fe27100 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -20,7 +20,7 @@ class FindingApi:
             'findItemsAdvanced', 'findCompletedItems',
             'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
             'findItemsByProduct'
-            ][service]
+            ][service] # Currently using only index 4, i.e., service = 4
         self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
         # examples of additional params you may want to add:
         # 'itemFilter(0).value':'Used'
@@ -28,45 +28,11 @@ class FindingApi:
         # 'itemFilter(1).name':'ListingType'
         # 'itemFilter(1).value':'AuctionWithBIN'

-    def update_cats(self):
-
-        parent_cats = ['3034', '93427']
-        cat_list = []
-
-        for department in parent_cats:
-
-            params = {
-                "callname":"GetCategoryInfo",
-                "appid":cfg.sec['SECURITY-APPNAME'],
-                "version":"671",
-                "responseencoding":"JSON",
-                "CategoryID":department,
-                "IncludeSelector":"ChildCategories",
-            }
-
-            try:
-                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
-                response.raise_for_status()
-
-            except requests.exceptions.RequestException:
-                print('connection error')
-
-            response = response.json()
-            response = response['CategoryArray']['Category'][1:]
-            temp_cat_list = [cat['CategoryID'] for cat in response]
-            cat_list.extend(temp_cat_list)
-
-        with open('cat_list.txt', 'w') as f:
-            json.dump(cat_list, f)
-
-        # leaf_list = [node['LeafCategory'] for node in response]
-        return cat_list
-
     def get_data(self, category_id, i):

         '''
-        Gets raw JSON data fom FindingApi service call
-        Currently being used to get itemIDs from categories
+        Gets raw JSON data from the FindingApi service call. Currently being
+        used to get itemIDs from categories.
         '''

         params = {
@@ -144,6 +110,45 @@ class ShoppingApi:
     Creates objects from ShoppingApi service calls that can interact with
     pandas dataframes
     '''
+
+    def update_cats(self):
+        '''
+        Updates cat_list.txt
+        '''
+
+        parent_cats = ['3034', '93427'] # Women's and Men's shoe departments
+        cat_list = []
+
+        for department in parent_cats:
+
+            params = {
+                "callname":"GetCategoryInfo",
+                "appid":cfg.sec['SECURITY-APPNAME'],
+                "version":"671",
+                "responseencoding":"JSON",
+                "CategoryID":department,
+                "IncludeSelector":"ChildCategories",
+            }
+
+            try:
+                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
+                response.raise_for_status()
+
+            except requests.exceptions.RequestException:
+                print('connection error')
+
+            response = response.json()
+            response = response['CategoryArray']['Category'][1:] # excludes index
+            # 0 as this is the parent node, i.e., women's or men's dept.
+
+            temp_cat_list = [cat['CategoryID'] for cat in response]
+            cat_list.extend(temp_cat_list)
+
+        with open('cat_list.txt', 'w') as f:
+            json.dump(cat_list, f)
+
+        # leaf_list = [node['LeafCategory'] for node in response]
+
     def get_item_from_findItemsByCategory(self, twenty_id):
         '''
         Gets raw JSON data from multiple live listings given multiple itemIds
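# NOTE (sketch, not part of the patch): in update_cats above, the except branch
# only prints 'connection error' and execution falls through to response.json(),
# so a failed request can still break the loop or reuse a stale response. A
# minimal sketch of one way to guard the parse step; the helper name, retry
# count, and timeout value are illustrative assumptions, not code from this repo.
import json
import requests

def get_child_category_ids(appid, department, retries=3):
    '''Return child CategoryIDs for one parent department, or [] on failure.'''
    params = {
        "callname": "GetCategoryInfo",
        "appid": appid,
        "version": "671",
        "responseencoding": "JSON",
        "CategoryID": department,
        "IncludeSelector": "ChildCategories",
    }
    for _ in range(retries):
        try:
            response = requests.get("https://open.api.ebay.com/shopping?",
                                    params=params, timeout=4)
            response.raise_for_status()
            categories = response.json()['CategoryArray']['Category']
            return [cat['CategoryID'] for cat in categories[1:]]  # index 0 is the parent node
        except (requests.exceptions.RequestException, KeyError, ValueError):
            continue
    return []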
@@ -157,7 +162,6 @@ class ShoppingApi:
             "IncludeSelector":"ItemSpecifics",
         }

-        # TODO Add try excepts here
         try:
             response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
             response.raise_for_status()
@@ -175,21 +179,33 @@ class ShoppingApi:
         Runs get_item_from_findItemsByCategory in multiple threads to get relevant data for
         creating training sets
         '''
-        data = []
+        try:
+            with open('raw_data.txt') as f:
+                data = json.load(f)
+        except (FileNotFoundError, ValueError):
+            data = []

         finding = FindingApi(4, 2) # TODO replace these test values before production
         item_id_results = finding.get_ids_from_cats()

         with concurrent.futures.ThreadPoolExecutor() as executor:
             for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
-                # print(future)
                 for item in future:
                     data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
                     # data.update(future)
-        # TODO save data here. You'll use this with your curate data class. SAve this as text file
+        with open('raw_data.txt', 'w') as f:
+            json.dump(data, f)
         return data # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list.
         # data dictionary can't update from list of dicts unless iterated over. Might need a different way to update.
         # TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.
+# NOTE:
+
+# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
+# per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
+# to divide these up into the categories. This will leave you with about 6.25K results per cat.
+# More than enough data for your dataset.
+
+
 class CurateData:
     '''
     Contains methods for curating data for machine learning training sets;
@@ -199,7 +215,8 @@ class CurateData:
     def import_raw(self):

         '''
-        imports raw response json from local file
+        Imports raw response JSON from a local file. This is data from the
+        GetMultipleItems call in ShoppingApi.
         '''
         with open('raw_data.txt') as f:
             raw_data = json.load(f)
@@ -212,7 +229,7 @@ class CurateData:
         '''
         to_json = json.dumps(raw_data)
         raw_df = pd.read_json(to_json)
-        return raw_df
+        return raw_df # TODO save csv here?

     def to_training(self, raw_data): # NOTE need to create copies not views
         '''
@@ -293,20 +310,24 @@ class CurateData:
         dropd = nvl_training.drop(col_drop, axis=1)
         return dropd
+# For future reference: to deal with inconsistent values in the NVL (due to sellers entering custom values in the fields), you can drop either listings or k/v pairs that are unique.
+# Uniqueness can be determined by computing the frequency of each k/v pair and dropping those with a frequency of 1.
+
+# Check the above list of cols I want to keep to see if there are duplicates with different spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)
+

     def expand_nvlclass(self, class_training, dropd):
         '''
         takes image url list from each cell and expands them into separate/duplicate
         instances. Modifies both class training and dropd dfs. Appends custom
         image url dict {'source':'target'}.
         '''

-        expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object
+        expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
         expanded_class = expanded_class.dropna(subset=['PictureURL'])
         expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
-        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop duplicates here or before instantiating curate object
+
+        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
         expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
         expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-        # expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])

         expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
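# NOTE (sketch, not part of the patch): the 'For future reference' comment in the
# hunk above proposes dropping k/v pairs that occur only once across listings. A
# minimal pandas sketch of that frequency test; the function name and the
# NaN-masking approach are illustrative assumptions, not code from this repo.
import pandas as pd

def mask_singleton_values(df, cols):
    '''Replace values that occur only once in a column with NaN.'''
    out = df.copy()
    for col in cols:
        counts = out[col].value_counts()        # frequency of each value in the column
        singletons = counts[counts == 1].index  # values seen exactly once
        out[col] = out[col].where(~out[col].isin(singletons))
    return out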
@@ -325,7 +346,9 @@ class CurateData:
         with open('temp_pics_source_list.txt', 'w') as f:
             json.dump(temp_pics_source_list, f)

-        # TODO still need to save these as csv files
+        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8', header=False)
+        # TODO open csv here, drop duplicates and save again unless there's a better way
+        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8', header=False)
         return expanded_class, expanded_dropd

     def dl_pictures(self, *args):
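# NOTE (sketch, not part of the patch): the to_csv calls above append on every run,
# and the TODO asks about dropping duplicates afterwards. A minimal sketch of one
# way to do that in a single step; append_deduped is an illustrative helper name,
# and the 'PictureURL' default mirrors the drop_duplicates subset used in
# expand_nvlclass rather than a confirmed requirement.
import os
import pandas as pd

def append_deduped(df, path, subset='PictureURL'):
    '''Combine new rows with any existing CSV, drop duplicates, and rewrite the file.'''
    if os.path.isfile(path):
        existing = pd.read_csv(path, encoding='utf-8')
        df = pd.concat([existing, df], ignore_index=True)
    df = df.drop_duplicates(subset=[subset]).reset_index(drop=True)
    df.to_csv(path, encoding='utf-8', index=False)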
@@ -431,16 +454,3 @@ def main():
 if __name__ == "__main__":
     main()

-# Limited to 5000 calls to shopping api per day, and getMultpileitems service maxes out at 20 items
-# per call leaving you 100,000 items per day for you pandas dataframe initially. So you'll have
-# to divide these up into the categories. This will leave you with about 6.25K results per cat.
-# More than enough data for your dataset.
-
-# for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
-# can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.
-
-# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
-# TO AVOID HICCUPS WHEN CREATING DATASET
-# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF Shoe TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
-
-# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)
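# NOTE (sketch, not part of the patch): the removed comment above (now kept inside
# CurateData) asks about columns duplicated under different spellings, e.g.
# Departement vs. Department. A minimal pandas sketch of folding such pairs into a
# single column; merge_aliased_columns and the example alias pairs are illustrative
# assumptions, not a confirmed mapping from this repo.
import pandas as pd

def merge_aliased_columns(df, aliases):
    '''Fold each alias column into its canonical column, filling gaps, then drop the alias.'''
    out = df.copy()
    for canonical, alias in aliases.items():
        if alias in out.columns and canonical in out.columns:
            out[canonical] = out[canonical].fillna(out[alias])
            out = out.drop(columns=[alias])
        elif alias in out.columns:
            out = out.rename(columns={alias: canonical})
    return out

# Hypothetical usage:
# dropd = merge_aliased_columns(dropd, {'Department': 'Departement', 'Closure': 'Fastening'})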