From fd6fd04ecd21c32299d8866974ea6a0fcd16a2a1 Mon Sep 17 00:00:00 2001 From: spbeach46 Date: Fri, 22 Jan 2021 23:21:56 -0700 Subject: [PATCH] attempting curate by starting with pandas df first --- ebay_api.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/ebay_api.py b/ebay_api.py index 92d394c..2ceeffb 100644 --- a/ebay_api.py +++ b/ebay_api.py @@ -1,3 +1,5 @@ +import importlib +import numpy as np import concurrent.futures import json import requests @@ -117,7 +119,7 @@ class ShoppingApi: # data.update(future) return data # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list. # data dictionary can't update from list of dicts unless iterated over. Might need a different way to update. - +# TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself. class CurateData: ''' Contains functions for curating data for machine learning training sets; @@ -169,21 +171,27 @@ class CurateData: ''' Creates same training instance per photo for ''' - for url in picture_url_list: - remote_url = {'PictureURL':url} - training.update(remote_url) - item_id = self.extract_itemId(item) - training.update(item_id) - catId = self.extract_catId(item) - training.update(catId) - prime_cat_name = self.extract_prime_cat_name(item) - training.update(prime_cat_name) - nvl_dict = self.extract_nvl(item) - training.update(nvl_dict) + # for url in picture_url_list: # maybe try removing for loop to see if csv updates correctly here + # remote_url = {'PictureURL':url} + # training.update(remote_url) + item_id = self.extract_itemId(item) + training.update(item_id) + catId = self.extract_catId(item) + training.update(catId) + prime_cat_name = self.extract_prime_cat_name(item) + training.update(prime_cat_name) + nvl_dict = self.extract_nvl(item) + training.update(nvl_dict) + + df = 
pd.json_normalize(training) # TODO FIX INDENT HERE? + #df.to_csv('training.csv', mode='a') + print(training) # after looking at the training output it looks like everything might be out of order possibly due to multithreading issues. Due to this you may have to use a more fine-grained + # multithreading module + def data_frame(self, data): + to_json = json.dumps(data) + raw_df = pd.read_json(to_json) + return raw_df - df = pd.json_normalize(training) # TODO FIX INDENT HERE? - df.to_csv('training.csv', mode='a') - return training def main(): '''