added dict for service and page input options
parent 35cdf8374f
commit 152b72f5df

1 changed file: ebay_api.py (52 lines changed)
@@ -24,7 +24,7 @@ class FindingApi:
         self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints

         # examples of additional params you may want to add:
-        # 'itemFilter(0).value':'Used'
+        # 'itemFilter(0).value':'Used' consider using this with findCompletedItems call
         # 'itemFilter(1).name':'ListingType'
         # 'itemFilter(1).value':'AuctionWithBIN'

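Note: the commented-out filters follow the Finding API's flattened itemFilter(n).name / itemFilter(n).value request-parameter convention. A rough sketch of how the request params might look with those filters switched on; the app ID, category, and the 'Condition' filter name are illustrative placeholders, not values taken from this commit:

# Illustrative only: SECURITY-APPNAME and categoryId are placeholders.
params = {
    'OPERATION-NAME': 'findCompletedItems',
    'SERVICE-VERSION': '1.13.0',
    'SECURITY-APPNAME': 'YOUR-APP-ID',
    'RESPONSE-DATA-FORMAT': 'JSON',
    'categoryId': '11450',
    'paginationInput.entriesPerPage': '100',
    'paginationInput.pageNumber': '1',
    'itemFilter(0).name': 'Condition',
    'itemFilter(0).value': 'Used',
    'itemFilter(1).name': 'ListingType',
    'itemFilter(1).value': 'AuctionWithBIN',
}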
@@ -83,10 +83,13 @@ class FindingApi:
             for future in executor.map(lambda p: self.get_data(*p), args):
                 data = future

+                '''
+                These try excepts may be unnecessary.
+                '''
                 try: # TODO if conditionals are not working due to each thread checking the same unedited item_id_results list
                     training = pd.read_csv('training.csv')
                     for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
-                        if (item not in training.values) and (item not in itemid_results_list):
+                        if (item not in training.values) and (item not in itemid_results_list): # might not be required
                             itemid_results_list.append(item['itemId'][0])

                 except (pd.errors.EmptyDataError, FileNotFoundError):
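Note on the TODO in this hunk: because every worker filters against the same itemid_results_list while other workers are still appending to it, duplicates can still slip through, and `item not in training.values` compares a whole response dict against the CSV's cell values. One alternative, sketched on the assumption that training.csv has an ItemID column and that os and pandas are imported at module level, is to collect the ids from all futures first and filter once at the end:

# Sketch of a post-hoc dedupe inside get_ids_from_cats (assumes an
# 'ItemID' column in training.csv; adjust if the real header differs).
itemid_results_list = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for data in executor.map(lambda p: self.get_data(*p), args):
        try:
            items = data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']
        except (KeyError, IndexError, TypeError):
            continue
        itemid_results_list.extend(item['itemId'][0] for item in items)

if os.path.isfile('training.csv'):
    seen = set(pd.read_csv('training.csv')['ItemID'].astype(str))
    itemid_results_list = [i for i in itemid_results_list if i not in seen]
itemid_results_list = list(dict.fromkeys(itemid_results_list))  # drop in-batch dupes, keep order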
@@ -184,7 +187,21 @@ class ShoppingApi:
                 data = json.load(f)
         except (FileNotFoundError, ValueError):
             data = []
-        finding = FindingApi(4, 2) # TODO replace these test values before production
+
+        service_dict = {
+            0: 'findItemsAdvanced', 1: 'findCompletedItems',
+            2: 'findItemsAdvanced', 3: 'findCompletedItems',
+            4: 'findItemsByProduct'}
+        service_dict
+
+        fnd_srvc = int(input(str(service_dict) + "choose Finding call: ('press enter' for default(4))"))
+        pg_num = int(input('how many pages? (76 max)'))
+        if fnd_srvc != '':
+            finding = FindingApi(fnd_srvc, pg_num) # TODO replace these test values before production or add option to change prior to execution
+        else:
+            fnd_srvc = 4
+            finding = FindingApi(fnd_srvc, pg_num)
+
         item_id_results = finding.get_ids_from_cats()
         with concurrent.futures.ThreadPoolExecutor() as executor:
             for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
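Note on the new prompt: because the input is passed through int() before the `!= ''` check, pressing enter raises ValueError instead of falling back to the default, and comparing an int against '' is always true. A small sketch of the intended "enter for default" behaviour; service_dict, fnd_srvc, pg_num and FindingApi are the names used in the diff, and the defaults of 4 and 2 simply mirror the old FindingApi(4, 2) test values:

# Sketch of the intended default handling; defaults 4 and 2 are assumptions.
raw_srvc = input(str(service_dict) + " choose Finding call (enter for default 4): ")
fnd_srvc = int(raw_srvc) if raw_srvc.strip() else 4

raw_pages = input('how many pages? (76 max, enter for 2): ')
pg_num = min(int(raw_pages), 76) if raw_pages.strip() else 2

finding = FindingApi(fnd_srvc, pg_num)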
@@ -229,7 +246,12 @@ class CurateData:
         '''
         to_json = json.dumps(raw_data)
         raw_df = pd.read_json(to_json)
-        return raw_df # TODO save csv here?
+        raw_df.to_csv('raw_df.csv', mode='a')
+        raw_df = pd.read_csv('raw_df.csv', index_col=0)
+        raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True) # drops dupes after appending new data. (ShoppingApi call might include dupes)
+        raw_df.to_csv('raw_df.csv', mode='a') # TODO this might still only save the unmodified/undropped original. check to make sure
+
+        return raw_df

     def to_training(self, raw_data): # NOTE need to create copies not views
         '''
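On the TODO in this hunk: drop_duplicates returns a new DataFrame rather than modifying raw_df in place, so the deduped result is discarded, and the second to_csv with mode='a' appends the still-duplicated frame onto the file again. A sketch of the append / reload / dedupe / overwrite sequence that appears to be intended (assumes `import os` at module level):

# Sketch only; the os import is an assumption, 'ItemID' comes from the diff.
raw_df.to_csv('raw_df.csv', mode='a', header=not os.path.isfile('raw_df.csv'))
raw_df = pd.read_csv('raw_df.csv', index_col=0)
raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)  # assign the result back
raw_df.to_csv('raw_df.csv', mode='w')  # overwrite; appending again would re-add the dupes
return raw_df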
@@ -346,9 +368,18 @@ class CurateData:
         with open('temp_pics_source_list.txt', 'w') as f:
             json.dump(temp_pics_source_list, f)

-        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8', header=False)
-        # TODO open csv here, drop duplicates and save again unless there's a better way
-        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8', header=False)
+        # Append to master training dataframes, drop potential dupes and save
+
+        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')
+        expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
+        expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
+        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8') # TODO see line 235 about views and copies
+
+        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
+        expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
+        expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
+        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')
+
         return expanded_class, expanded_dropd

     def dl_pictures(self, *args):
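The assign-back and overwrite caveat from the raw_df hunk applies to both blocks here as well, and the pattern is repeated verbatim for expanded_class and expanded_dropd. A hypothetical helper could keep the two paths consistent; the function name is illustrative, not from this repo:

# Hypothetical helper, not part of this commit; assumes os is imported.
def append_and_dedupe(df, path, subset='PictureURL'):
    df.to_csv(path, mode='a', encoding='utf-8', header=not os.path.isfile(path))
    combined = pd.read_csv(path, index_col=0)
    combined = combined.drop_duplicates(subset=[subset]).reset_index(drop=True)
    combined.to_csv(path, mode='w', encoding='utf-8')  # overwrite rather than re-append
    return combined

expanded_class = append_and_dedupe(expanded_class, 'expanded_class.csv')
expanded_dropd = append_and_dedupe(expanded_dropd, 'expanded_dropd.csv')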
@@ -454,3 +485,10 @@ def main():
 if __name__ == "__main__":
     main()

+'''
+Based on your sample set of 10 images, if you have an average of 5 images per
+listing and you download a hundred listings, you will have about 102 Gb of
+image data. That's just for one day. If you have more than a million listings
+you're looking at a little over 1Tb of image data. You don't even know if this
+is good data yet.
+'''
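The figures in the added note line up only if the images average roughly 0.2 MB each and "a hundred listings" is read as a hundred thousand. A quick back-of-envelope check, with the per-image size inferred from the 1 Tb-per-million-listings figure rather than measured:

# Back-of-envelope only; 0.2 MB/image is inferred from the note, not measured.
avg_image_mb = 0.2                      # ~1 TB / (1_000_000 listings * 5 images)
per_listing_mb = 5 * avg_image_mb       # 5 images per listing on average
print(100_000 * per_listing_mb / 1e6, 'TB for 100k listings')    # ~0.1 TB, i.e. ~100 GB
print(1_000_000 * per_listing_mb / 1e6, 'TB for 1M listings')    # ~1 TB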