adding csv file save and appending functionality where needed
This commit is contained in:
parent c8ab1b13d9
commit 35cdf8374f

ebay_api.py (132 changed lines)
@@ -20,7 +20,7 @@ class FindingApi:
             'findItemsAdvanced', 'findCompletedItems',
             'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
             'findItemsByProduct'
-            ][service]
+            ][service] # Currently using only index 4, i.e., service = 4
         self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints

         # examples of additional params you may want to add:
@@ -28,45 +28,11 @@ class FindingApi:
         # 'itemFilter(1).name':'ListingType'
         # 'itemFilter(1).value':'AuctionWithBIN'

-    def update_cats(self):
-
-        parent_cats = ['3034', '93427']
-        cat_list = []
-
-        for department in parent_cats:
-
-            params = {
-                "callname":"GetCategoryInfo",
-                "appid":cfg.sec['SECURITY-APPNAME'],
-                "version":"671",
-                "responseencoding":"JSON",
-                "CategoryID":department,
-                "IncludeSelector":"ChildCategories",
-                }
-
-            try:
-                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
-                response.raise_for_status()
-
-            except requests.exceptions.RequestException:
-                print('connection error')
-
-            response = response.json()
-            response = response['CategoryArray']['Category'][1:]
-            temp_cat_list = [cat['CategoryID'] for cat in response]
-            cat_list.extend(temp_cat_list)
-
-        with open('cat_list.txt', 'w') as f:
-            json.dump(cat_list, f)
-
-        # leaf_list = [node['LeafCategory'] for node in response]
-        return cat_list
-
     def get_data(self, category_id, i):

         '''
-        Gets raw JSON data fom FindingApi service call
-        Currently being used to get itemIDs from categories
+        Gets raw JSON data from FindingApi service call. Currently being used to
+        get itemIDs from categories
         '''

         params = {
@@ -144,6 +110,45 @@ class ShoppingApi:
     Creates objects from ShoppingApi service calls that can interact with
     pandas dataframes
     '''
+
+    def update_cats(self):
+        '''
+        Updates cat_list.txt
+        '''
+
+        parent_cats = ['3034', '93427'] # Women's and Men's shoe departments
+        cat_list = []
+
+        for department in parent_cats:
+
+            params = {
+                "callname":"GetCategoryInfo",
+                "appid":cfg.sec['SECURITY-APPNAME'],
+                "version":"671",
+                "responseencoding":"JSON",
+                "CategoryID":department,
+                "IncludeSelector":"ChildCategories",
+                }
+
+            try:
+                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
+                response.raise_for_status()
+
+            except requests.exceptions.RequestException:
+                print('connection error')
+
+            response = response.json()
+            response = response['CategoryArray']['Category'][1:] # excludes index
+            # 0 as this is parent node, i.e., women's or men's dept.
+
+            temp_cat_list = [cat['CategoryID'] for cat in response]
+            cat_list.extend(temp_cat_list)
+
+        with open('cat_list.txt', 'w') as f:
+            json.dump(cat_list, f)
+
+        # leaf_list = [node['LeafCategory'] for node in response]
+
     def get_item_from_findItemsByCategory(self, twenty_id):
         '''
         Gets raw JSON data from multiple live listings given multiple itemIds
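Aside on the error handling in update_cats above: if requests.get raises, the except branch only prints 'connection error' and the method then calls .json() on a response that is undefined or left over from the previous loop iteration. The diff's own comments ("# TODO Add try excepts here", and the end-of-file TODO about adding try/excepts for every API call) point at the same gap. A minimal sketch of one way to guard the call; the helper name and the early continue are assumptions, not part of this commit:

import requests

def get_json_or_none(url, params, timeout=4):
    # Hypothetical helper, sketch only: return parsed JSON, or None on any
    # request or parse failure, so a stale `response` never leaks through.
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError):
        print('connection error')
        return None

# usage sketch inside the update_cats loop:
#     data = get_json_or_none("https://open.api.ebay.com/shopping?", params)
#     if data is None:
#         continue  # skip this department rather than reuse the previous response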
@@ -157,7 +162,6 @@ class ShoppingApi:
            "IncludeSelector":"ItemSpecifics",
            }

-        # TODO Add try excepts here
        try:
            response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
            response.raise_for_status()
@@ -175,21 +179,33 @@
        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
        data for creating training sets
        '''
-        data = []
+        try:
+            with open('raw_data.txt') as f:
+                data = json.load(f)
+        except (FileNotFoundError, ValueError):
+            data = []
        finding = FindingApi(4, 2) # TODO replace these test values before production
        item_id_results = finding.get_ids_from_cats()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
                # print(future)
                for item in future:
                    data.append(item) # The end result should be a list of dicts where each dict in the list is a listing
                # data.update(future)
-        # TODO save data here. You'll use this with your curate data class. Save this as text file
+        with open('raw_data.txt', 'w') as f:
+            json.dump(data, f)
        return data # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list.

        # data dictionary can't update from list of dicts unless iterated over. Might need a different way to update.
+        # TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.
+
+        # NOTE:
+
+        # Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
+        # per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
+        # to divide these up into the categories. This will leave you with about 6.25K results per cat.
+        # More than enough data for your dataset.


class CurateData:
    '''
    Contains methods for curating data for machine learning training sets;
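The NOTE block in the hunk above does the call-budget arithmetic in prose. As a quick sanity check of those numbers (the category count of 16 is inferred from the stated ~6.25K per category, not taken from the code):

CALLS_PER_DAY = 5000        # stated Shopping API daily call limit
ITEMS_PER_CALL = 20         # GetMultipleItems maximum per call
N_CATEGORIES = 16           # assumed; implied by "about 6.25K results per cat"

items_per_day = CALLS_PER_DAY * ITEMS_PER_CALL        # 100,000 items/day
items_per_category = items_per_day / N_CATEGORIES     # 6,250 items per category
print(items_per_day, items_per_category)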
@@ -199,7 +215,8 @@ class CurateData:

    def import_raw(self):
        '''
-        imports raw response json from local file
+        imports raw response json from local file. This is data from
+        GetMultipleItems call in ShoppingApi
        '''
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
@@ -212,7 +229,7 @@ class CurateData:
        '''
        to_json = json.dumps(raw_data)
        raw_df = pd.read_json(to_json)
-        return raw_df
+        return raw_df # TODO save csv here?

    def to_training(self, raw_data): # NOTE need to create copies not views
        '''
@@ -293,20 +310,24 @@ class CurateData:
        dropd = nvl_training.drop(col_drop, axis=1)
        return dropd

        # for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
        # can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.

        # Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)

    def expand_nvlclass(self, class_training, dropd):
        '''
        takes image url list from each cell and expands them into separate/duplicate
        instances. Modifies both class training and dropd dfs. Appends custom
        image url dict {'source':'target'}.
        '''
-        expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object
+        expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
+        expanded_class = expanded_class.dropna(subset=['PictureURL'])
+        expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        # expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
-        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop duplicates here or before instantiating curate object

+        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
+        expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
+        expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        # expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])

        expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

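The "for future reference" comment in the hunk above describes, in prose, dropping name/value pairs that occur only once (seller-entered custom values). A minimal pandas sketch of that frequency idea; the column names here are illustrative, not from the repo:

import pandas as pd

# Toy frame standing in for the NVL training data; 'Style' and 'Closure'
# are made-up column names.
df = pd.DataFrame({
    'Style': ['Sneaker', 'Sneaker', 'Custom-Style-XYZ'],
    'Closure': ['Lace Up', 'Lace Up', 'Lace Up'],
})

# For each column, count how often each value occurs and blank out the
# values that occur exactly once (the one-off seller-specific entries).
for col in df.columns:
    counts = df[col].map(df[col].value_counts())
    df.loc[counts == 1, col] = pd.NA

print(df)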
@@ -325,7 +346,9 @@ class CurateData:
        with open('temp_pics_source_list.txt', 'w') as f:
            json.dump(temp_pics_source_list, f)

-        # TODO still need to save these as csv files
+        expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8', header=False)
+        # TODO open csv here, drop duplicates and save again unless there's a better way
+        expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8', header=False)
        return expanded_class, expanded_dropd

    def dl_pictures(self, *args):
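The two to_csv(..., mode='a', header=False) calls above append on every run, and the TODO between them asks about reopening the file to drop duplicates. A minimal sketch of that round trip, assuming 'PictureURL' is a workable uniqueness key and remembering that to_csv (with its default index=True) also wrote the old dataframe index as the first column:

import pandas as pd

def dedupe_appended_csv(path, column_names, subset='PictureURL'):
    # Sketch only, not part of this commit: reload a csv built up with
    # mode='a', header=False, drop duplicate rows, and rewrite it once.
    df = pd.read_csv(path, header=None)   # no header row was written
    df = df.iloc[:, 1:]                   # first column is the old dataframe index
    df.columns = column_names             # assumes the same columns on every append
    df = df.drop_duplicates(subset=[subset]).reset_index(drop=True)
    df.to_csv(path, index=False)          # rewrite once, with a header this time
    return df

# usage sketch; column names come from the in-memory dataframe:
# dedupe_appended_csv('expanded_class.csv', list(expanded_class.columns))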
@@ -431,16 +454,3 @@ def main():
 if __name__ == "__main__":
     main()

-# Limited to 5000 calls to shopping api per day, and getMultpileitems service maxes out at 20 items
-# per call leaving you 100,000 items per day for you pandas dataframe initially. So you'll have
-# to divide these up into the categories. This will leave you with about 6.25K results per cat.
-# More than enough data for your dataset.
-
-# for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
-# can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.
-
-# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
-# TO AVOID HICCUPS WHEN CREATING DATASET
-# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF Shoe TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
-
-# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)