176 lines
8.1 KiB
Python
176 lines
8.1 KiB
Python
import concurrent.futures
import importlib
import io
import json

import numpy as np
import pandas as pd
import requests
|
|
|
|
class FindingApi:
    '''Methods for accessing eBay's FindingApi services'''

    # Seconds to wait on a single HTTP request before giving up, so one
    # stalled connection cannot block its worker thread forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, service, pageNumber):
        '''
        Selects the Finding operation and the result pages to request.

        service: index into the list of operation names below
        (e.g. 4 selects 'findItemsByCategory').
        pageNumber: exclusive upper bound of the pages requested, i.e.
        pages 1 .. pageNumber - 1 are fetched for every category.
        '''
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
            ][service]
        self.pageNumber = list(range(1, pageNumber)) # 64 pages is recommended
        # as this will give equal weights to cats given call constraints

    # departments = ["3034","93427"] (womens and mens)

    def get_data(self, category_id, i):
        '''
        Gets raw JSON data from a FindingApi service call.
        Currently being used to get itemIDs from categories.

        category_id: value sent as the "categoryId" query parameter.
        i: page number of the result set to request.
        Returns the decoded JSON body of the response.
        '''
        # NOTE(review): the app id is committed in source; consider loading
        # it from configuration or the environment instead.
        # NOTE(review): eBay's call reference spells the paging field
        # "paginationInput.pageNumber" - confirm the service accepts the
        # capitalised form used here.
        params = {
            "OPERATION-NAME":self.service,
            "SECURITY-APPNAME":"scottbea-xlister-PRD-6796e0ff6-14862949",
            "SERVICE-VERSION":"1.13.0",
            "RESPONSE-DATA-FORMAT":"JSON",
            "categoryId":category_id,
            "paginationInput.entriesPerPage":"100",
            "paginationInput.PageNumber":i
            }

        response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                params=params, timeout=self.REQUEST_TIMEOUT)
        return response.json()

    def _item_ids(self, data):
        '''Returns the itemIds listed in one service response, in order.'''
        # The top-level key is named after the operation that was called,
        # so derive it from self.service instead of hard-coding one name.
        search_result = data[self.service + 'Response'][0]['searchResult'][0]
        # A page without matches may carry no 'item' key at all; treat
        # that as an empty page rather than failing the whole crawl.
        return [item['itemId'][0] for item in search_result.get('item', [])]

    @staticmethod
    def _load_training_values():
        '''
        Returns every cell of training.csv as a set of strings, or an
        empty set when the file is missing or empty.
        '''
        try:
            # dtype=str keeps ids comparable with the string ids in the API
            # responses (read_csv would otherwise parse them as integers).
            training = pd.read_csv('training.csv', dtype=str)
        except (pd.errors.EmptyDataError, FileNotFoundError):
            return set()
        return set(training.to_numpy().ravel())

    def get_ids_from_cats(self):
        '''
        Creates a list of comma-joined itemId strings (at most 20 ids
        each) to use for the ShoppingApi call.

        Reads the category ids from cat_list.txt (JSON), requests every
        configured page of every category, and keeps each itemId once,
        skipping ids that already appear in training.csv.
        '''
        pages = self.pageNumber

        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        # Loaded once up front; set membership replaces the duplicated
        # try/except loops and the per-page re-read of the csv.
        known_values = self._load_training_values()

        # Dict keys de-duplicate while keeping first-seen order. Results are
        # consumed here in the calling thread, so no locking is needed.
        unique_ids = {}

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for category_id in cat_list:
                responses = executor.map(self.get_data,
                                         [category_id] * len(pages), pages)
                for data in responses:
                    for item_id in self._item_ids(data):
                        if item_id not in known_values:
                            unique_ids[item_id] = None

        ids = list(unique_ids)
        return [','.join(ids[n:n + 20]) for n in range(0, len(ids), 20)]
|
|
|
|
class ShoppingApi:
    '''
    Wraps ShoppingApi service calls; the listings it returns are meant
    to be loaded into pandas dataframes
    '''

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Requests the raw JSON of several live listings in one
        GetMultipleItems call and returns the value stored under 'Item'.

        twenty_id: the itemIds to look up (conky passes the comma-joined
        id strings produced by FindingApi.get_ids_from_cats).
        '''
        query = {
            "callname": "GetMultipleItems",
            "appid": "scottbea-xlister-PRD-6796e0ff6-14862949",
            "version": "671",
            "responseencoding": "JSON",
            "ItemID": twenty_id,
            "IncludeSelector": "ItemSpecifics",
        }
        payload = requests.get("https://open.api.ebay.com/shopping?", params=query).json()
        return payload['Item']

    def conky(self):
        '''
        Gathers the listings for every batch of itemIds found by
        FindingApi(4, 2) and returns them as one flat list, one entry per
        listing.

        Author's note: item_id_results could only be passed to
        executor.map when the variable was created inside this function.
        '''
        item_id_results = FindingApi(4, 2).get_ids_from_cats()

        with concurrent.futures.ThreadPoolExecutor() as executor:
            batches = executor.map(self.get_item_from_findItemsByCategory, item_id_results)
            # Each batch is the result of one call; flatten them so the end
            # result is a list where each element is a single listing.
            listings = [listing for batch in batches for listing in batch]

        return listings
        # TODO training var will require an updated dictionary rather than
        # a list; a dict can't be updated from a list of dicts without
        # iterating over it, so a different way to update may be needed.
        # TODO the problem with updating the dictionary/csv file may start
        # here; the item data might be getting appended out of order from
        # the item itself.
|
|
class CurateData:
    '''
    Contains functions for curating data for machine learning training sets;
    Takes item data from a ShoppingApi request as argument and extracts/creates
    key-value pairs that get updated to the custom dataframe used in ML
    training sets.
    '''

    def update_df(self, data):
        '''
        Creates training instances for dataset. picture_url_list expanded to
        max available pictures with each picture url corresponding to features
        in common with same listing (i.e., because there are multiple pictures
        per listing, each picture will be its own training instance).

        TODO: not implemented yet; currently does nothing and returns None.
        '''

    def data_frame(self, data):
        '''
        Builds the raw dataframe from item data by round-tripping it
        through JSON.

        data: JSON-serialisable item data (presumably the list of listing
        dicts returned by ShoppingApi.conky - confirm against caller).
        Returns the dataframe produced by pandas.read_json.
        '''
        to_json = json.dumps(data)
        # read_json expects a path or file-like object; passing the literal
        # JSON text is deprecated in pandas, so wrap it in a buffer.
        raw_df = pd.read_json(io.StringIO(to_json))
        return raw_df

    def to_training(self, data=None):
        '''
        Extracts the columns used for training from raw item data.

        data: item data accepted by data_frame. It is optional only to
        keep the old zero-argument signature callable; calling without it
        raises ValueError (the old body always failed with NameError).
        Returns a new dataframe with the columns ItemID, PictureURL,
        PrimaryCategoryID, PrimaryCategoryName, Title and ItemSpecifics,
        where ItemID and PrimaryCategoryID are converted to strings.
        Raises KeyError if any of those columns is missing from the data.
        '''
        if data is None:
            raise ValueError("to_training() needs the item data to convert")

        raw_df = self.data_frame(data)
        # .copy() makes the selection independent of raw_df before it is
        # modified.
        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID',
                                    'PrimaryCategoryName', 'Title', 'ItemSpecifics']].copy()
        # ids are identifiers, not quantities: store them as text.
        interm_df1 = interm_df1.astype({'ItemID': str, 'PrimaryCategoryID': str})
        return interm_df1

        # USE raw_df.loc[:, ['col1', 'col2', 'col3', 'etc']] for creating new df. There may be another way though.

        # USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files.
        # You can transform each new raw_df first before combining it with the previous transformed
        # df. Then you can take the raw_df and combine it with the old raw_df for backup.

        # TODO You will have to mess around more with pandas df to find a better solution to creating
        # your csv file: i.e., create dataframe from instances, run through process to customize your df
        # for final training set for your ml model training. Contemplate on the future... you want
        # ability to update main csv AND training csv; one for updating raw data instances from search
        # queries, and the other for updating your training set.
|
|
|
|
|
|
def main():
    '''
    Entry point of the program, which is meant to create/update a csv
    file used for ML training from live ebay listings. The body is still
    a placeholder and does nothing.
    '''
    # main goes here:
    return None


if __name__ == "__main__":
    main()
|
|
|
|
# Limited to 5000 calls to the shopping api per day, and the GetMultipleItems service maxes out at
# 20 items per call, leaving you 100,000 items per day for your pandas dataframe initially. So
# you'll have to divide these up into the categories. This will leave you with about 6.25K results
# per cat. More than enough data for your dataset.
|
|
|
|
# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
# TO AVOID HICCUPS WHEN CREATING DATASET
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS, EITHER USING YOUR OWN
# TAGS OR SOMEHOW FINDING TAGS ON OTHERS' LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as
# simple as adding a def to one of the apis to extract only the picture if it can identify what a
# tag looks like. So, it may actually be a good thing to include all the pictures in a training
# set, but then when you're ready to begin training you'll have a data cleaning pipeline specific
# to training a model to either learn shoe features or information on tags.
|