Attempting to curate by starting with a pandas DataFrame first
This commit is contained in:
parent
b4e1961ace
commit
fd6fd04ecd
38
ebay_api.py
38
ebay_api.py
@ -1,3 +1,5 @@
|
||||
import importlib
|
||||
import numpy as np
|
||||
import concurrent.futures
|
||||
import json
|
||||
import requests
|
||||
@ -117,7 +119,7 @@ class ShoppingApi:
|
||||
# data.update(future)
|
||||
return data # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list.
|
||||
# The data dictionary can't be updated from a list of dicts unless the list is iterated over. Might need a different way to update.
|
||||
|
||||
# TODO: The problem with updating the dictionary/CSV file possibly starts here; I think the item data is being appended out of order relative to the item itself.
|
||||
class CurateData:
|
||||
'''
|
||||
Contains functions for curating data for machine learning training sets;
|
||||
@ -169,21 +171,27 @@ class CurateData:
|
||||
'''
|
||||
Creates same training instance per photo for
|
||||
'''
|
||||
for url in picture_url_list:
|
||||
remote_url = {'PictureURL':url}
|
||||
training.update(remote_url)
|
||||
item_id = self.extract_itemId(item)
|
||||
training.update(item_id)
|
||||
catId = self.extract_catId(item)
|
||||
training.update(catId)
|
||||
prime_cat_name = self.extract_prime_cat_name(item)
|
||||
training.update(prime_cat_name)
|
||||
nvl_dict = self.extract_nvl(item)
|
||||
training.update(nvl_dict)
|
||||
# for url in picture_url_list: # maybe try removing for loop to see if csv updates correctly here
|
||||
# remote_url = {'PictureURL':url}
|
||||
# training.update(remote_url)
|
||||
item_id = self.extract_itemId(item)
|
||||
training.update(item_id)
|
||||
catId = self.extract_catId(item)
|
||||
training.update(catId)
|
||||
prime_cat_name = self.extract_prime_cat_name(item)
|
||||
training.update(prime_cat_name)
|
||||
nvl_dict = self.extract_nvl(item)
|
||||
training.update(nvl_dict)
|
||||
|
||||
df = pd.json_normalize(training) # TODO FIX INDENT HERE?
|
||||
#df.to_csv('training.csv', mode='a')
|
||||
print(training) # After looking at the training output, it looks like everything might be out of order, possibly due to multithreading issues. Because of this, you may have to use a more fine-grained
|
||||
# multithreading module
|
||||
def data_frame(self, data):
|
||||
to_json = json.dumps(data)
|
||||
raw_df = pd.read_json(to_json)
|
||||
return raw_df
|
||||
|
||||
df = pd.json_normalize(training) # TODO FIX INDENT HERE?
|
||||
df.to_csv('training.csv', mode='a')
|
||||
return training
|
||||
|
||||
def main():
|
||||
'''
|
||||
|
Loading…
Reference in New Issue
Block a user