adding csv file save and appending functionality where needed

spbeach46 2021-05-11 11:11:59 -07:00
parent c8ab1b13d9
commit 35cdf8374f


@@ -20,7 +20,7 @@ class FindingApi:
'findItemsAdvanced', 'findCompletedItems',
'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
'findItemsByProduct'
][service]
][service] # Currently using only index 4, i.e., service = 4
self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
# examples of additional params you may want to add:
@@ -28,45 +28,11 @@ class FindingApi:
# 'itemFilter(1).name':'ListingType'
# 'itemFilter(1).value':'AuctionWithBIN'
def update_cats(self):
parent_cats = ['3034', '93427']
cat_list = []
for department in parent_cats:
params = {
"callname":"GetCategoryInfo",
"appid":cfg.sec['SECURITY-APPNAME'],
"version":"671",
"responseencoding":"JSON",
"CategoryID":department,
"IncludeSelector":"ChildCategories",
}
try:
response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
response.raise_for_status()
except requests.exceptions.RequestException:
print('connection error')
response = response.json()
response = response['CategoryArray']['Category'][1:]
temp_cat_list = [cat['CategoryID'] for cat in response]
cat_list.extend(temp_cat_list)
with open('cat_list.txt', 'w') as f:
json.dump(cat_list, f)
# leaf_list = [node['LeafCategory'] for node in response]
return cat_list
def get_data(self, category_id, i):
'''
Gets raw JSON data from FindingApi service call
Currently being used to get itemIDs from categories
Gets raw JSON data from FindingApi service call. Currently being used to
get itemIDs from categories;
'''
params = {
@@ -144,6 +110,45 @@ class ShoppingApi:
Creates objects from ShoppingApi service calls that can interact with
pandas dataframes
'''
def update_cats(self):
'''
Updates cat_list.txt
'''
parent_cats = ['3034', '93427'] # Women's and Men's shoe departments
cat_list = []
for department in parent_cats:
params = {
"callname":"GetCategoryInfo",
"appid":cfg.sec['SECURITY-APPNAME'],
"version":"671",
"responseencoding":"JSON",
"CategoryID":department,
"IncludeSelector":"ChildCategories",
}
try:
response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
response.raise_for_status()
except requests.exceptions.RequestException:
print('connection error')
response = response.json()
response = response['CategoryArray']['Category'][1:] # excludes index
# 0 as this is parent node, i.e., women's or men's dept.
temp_cat_list = [cat['CategoryID'] for cat in response]
cat_list.extend(temp_cat_list)
with open('cat_list.txt', 'w') as f:
json.dump(cat_list, f)
# leaf_list = [node['LeafCategory'] for node in response]
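# Illustrative usage sketch (not part of this commit): the category list written
# above can be read back with json.load wherever category ids are needed, e.g.:
with open('cat_list.txt') as f:
    cat_list = json.load(f)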
def get_item_from_findItemsByCategory(self, twenty_id):
'''
Gets raw JSON data from multiple live listings given multiple itemIds
@@ -157,7 +162,6 @@ class ShoppingApi:
"IncludeSelector":"ItemSpecifics",
}
# TODO Add try excepts here
try:
response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
response.raise_for_status()
@@ -175,21 +179,33 @@ class ShoppingApi:
Runs get_item_from_findItemsByCategory in multiple threads to get relevant
data for creating training sets
'''
data = []
try:
with open('raw_data.txt') as f:
data = json.load(f)
except (FileNotFoundError, ValueError):
data = []
finding = FindingApi(4, 2) # TODO replace these test values before production
item_id_results = finding.get_ids_from_cats()
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
# print(future)
for item in future:
data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
# data.update(future)
# TODO save data here. You'll use this with your curate data class. Save this as a text file
with open('raw_data.txt', 'w') as f:
json.dump(data, f)
return data # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list.
# A data dictionary can't be updated from a list of dicts unless it's iterated over. Might need a different way to update.
# TODO the problem with updating the dictionary/csv file may start here; the item data might be getting appended out of order relative to the item itself.
# NOTE:
# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
# per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.
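# Back-of-the-envelope check of the NOTE above (an illustrative sketch; the
# 16-category split is an assumption implied by the ~6.25K-per-category figure):
calls_per_day = 5000                            # Shopping API daily call limit
items_per_call = 20                             # GetMultipleItems max items per call
items_per_day = calls_per_day * items_per_call  # 100,000 items per day
items_per_cat = items_per_day / 16              # ~6,250 results per category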
class CurateData:
'''
Contains methods for curating data for machine learning training sets;
@@ -199,7 +215,8 @@ class CurateData:
def import_raw(self):
'''
imports raw response json from local file
imports raw response json from local file. This is data from
GetMultipleItems call in ShoppingApi
'''
with open('raw_data.txt') as f:
raw_data = json.load(f)
@@ -212,7 +229,7 @@ class CurateData:
'''
to_json = json.dumps(raw_data)
raw_df = pd.read_json(to_json)
return raw_df
return raw_df # TODO save csv here?
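# A possible way to act on the "save csv here?" TODO above (an illustrative
# sketch, not part of this commit; the file name raw_df.csv is an assumption):
# append to an existing csv, writing the header only when the file is new.
import os
if os.path.isfile('raw_df.csv'):
    raw_df.to_csv('raw_df.csv', mode='a', header=False, encoding='utf-8')
else:
    raw_df.to_csv('raw_df.csv', mode='w', header=True, encoding='utf-8')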
def to_training(self, raw_data): # NOTE need to create copies not views
'''
@@ -293,20 +310,24 @@ class CurateData:
dropd = nvl_training.drop(col_drop, axis=1)
return dropd
# For future reference: to deal with inconsistent values in the nvl (due to sellers entering custom values in the fields), you can drop either listings or k/v pairs that are unique.
# Uniqueness can be determined by computing the frequency of each k/v pair --> list of unique k/v pairs --> frequency of each unique k/v pair --> drop those with a frequency of 1.
# Check the above list of cols I want to keep to see if there are duplicates with different spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)
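# Rough sketch of the frequency idea described above (illustrative, not part of
# this commit): for each nvl column in dropd, blank out values that occur only once.
for col in dropd.columns:
    counts = dropd[col].value_counts()
    singletons = counts[counts == 1].index
    dropd[col] = dropd[col].where(~dropd[col].isin(singletons))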
def expand_nvlclass(self, class_training, dropd):
'''
takes image url list from each cell and expands them into separate/duplicate
instances. Modifies both class training and dropd dfs. Appends custom
image url dict {'source':'target'}.
'''
expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object
expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
expanded_class = expanded_class.dropna(subset=['PictureURL'])
expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop duplicates here or before instantiating curate object
expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
# expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
@@ -325,7 +346,9 @@ class CurateData:
with open('temp_pics_source_list.txt', 'w') as f:
json.dump(temp_pics_source_list, f)
# TODO still need to save these as csv files
expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8', header=False)
# TODO open csv here, drop duplicates and save again unless there's a better way
expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8', header=False)
return expanded_class, expanded_dropd
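# One way to handle the "open csv here, drop duplicates and save again" TODO
# above (an illustrative sketch, not part of this commit; assumes the appended
# files have no header row, matching header=False above):
for fname in ('expanded_class.csv', 'expanded_dropd.csv'):
    appended = pd.read_csv(fname, header=None, encoding='utf-8')
    appended = appended.drop_duplicates().reset_index(drop=True)
    appended.to_csv(fname, mode='w', header=False, index=False, encoding='utf-8')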
def dl_pictures(self, *args):
@@ -431,16 +454,3 @@ def main():
if __name__ == "__main__":
main()
# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
# per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.
# For future reference: to deal with inconsistent values in the nvl (due to sellers entering custom values in the fields), you can drop either listings or k/v pairs that are unique.
# Uniqueness can be determined by computing the frequency of each k/v pair --> list of unique k/v pairs --> frequency of each unique k/v pair --> drop those with a frequency of 1.
# TODO NEED TO ADD TRY/EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
# TO AVOID HICCUPS WHEN CREATING THE DATASET
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF SHOE TAGS, EITHER USING YOUR OWN TAGS OR SOMEHOW FINDING TAGS IN OTHERS' LISTINGS. CRUCIAL FOR THE LISTING PROCESS. It may be as simple as adding a def to one of the APIs to extract only the picture if it can identify what a tag looks like. So it may actually be a good thing to include all the pictures in a training set, but when you're ready to begin training you'll need a data-cleaning pipeline specific to training a model to learn either shoe features or information on tags.
# Check the above list of cols I want to keep to see if there are duplicates with different spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)