Attempt curation by starting with a pandas DataFrame first

This commit is contained in:
spbeach46 2021-01-22 23:21:56 -07:00
parent b4e1961ace
commit fd6fd04ecd

View File

@ -1,3 +1,5 @@
import importlib
import numpy as np
import concurrent.futures
import json
import requests
@ -117,7 +119,7 @@ class ShoppingApi:
# data.update(future)
return data # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list.
# data dictionary can't update from list of dicts unless iterated over. Might need a different way to update.
# TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.
class CurateData:
'''
Contains functions for curating data for machine learning training sets;
@ -169,21 +171,27 @@ class CurateData:
'''
Creates same training instance per photo for
'''
for url in picture_url_list:
remote_url = {'PictureURL':url}
training.update(remote_url)
item_id = self.extract_itemId(item)
training.update(item_id)
catId = self.extract_catId(item)
training.update(catId)
prime_cat_name = self.extract_prime_cat_name(item)
training.update(prime_cat_name)
nvl_dict = self.extract_nvl(item)
training.update(nvl_dict)
# for url in picture_url_list: # maybe try removing for loop to see if csv updates correctly here
# remote_url = {'PictureURL':url}
# training.update(remote_url)
item_id = self.extract_itemId(item)
training.update(item_id)
catId = self.extract_catId(item)
training.update(catId)
prime_cat_name = self.extract_prime_cat_name(item)
training.update(prime_cat_name)
nvl_dict = self.extract_nvl(item)
training.update(nvl_dict)
df = pd.json_normalize(training) # TODO FIX INDENT HERE?
#df.to_csv('training.csv', mode='a')
print(training) # after looking at the training output it looks like everything might be out of order due possibly to multithreading issues. Due to this you may have to use a more finegrained
# multithreading module
def data_frame(self, data):
    '''
    Round-trip *data* through a JSON string and load it into a
    pandas DataFrame, which is then returned to the caller.
    '''
    serialized = json.dumps(data)
    return pd.read_json(serialized)
df = pd.json_normalize(training) # TODO FIX INDENT HERE?
df.to_csv('training.csv', mode='a')
return training
def main():
'''