# ebay-ml-lister/ebay_api.py
#
# 289 lines
# 16 KiB
# Python

import importlib
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
import config as cfg
class FindingApi:
    '''
    Methods for accessing eBay's FindingApi services.

    An instance is bound to one Finding API operation and a list of result
    pages. get_ids_from_cats() walks the categories listed in cat_list.txt
    and returns comma-joined batches of itemIds for the Shopping API.
    '''

    def __init__(self, service, pageNumber):
        '''
        service: index into the list of operation names below
                 (e.g. 4 -> 'findItemsByCategory').
        pageNumber: exclusive upper bound of the result pages to request,
                    i.e. pages 1 .. pageNumber - 1 are fetched.
        '''
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
            ][service]
        self.pageNumber = list(range(1, pageNumber))  # 77 pgs will give equal weights to cats given call constraints
        # departments = ["3034","93427"] (womens and mens)
        # examples of additional params you may want to add:
        # 'itemFilter(0).value':'Used'
        # 'itemFilter(1).name':'ListingType'
        # 'itemFilter(1).value':'AuctionWithBIN'

    def get_data(self, category_id, i):
        '''
        Gets raw JSON data from a FindingApi service call.
        Currently being used to get itemIDs from categories.

        category_id: eBay category id to search.
        i: result page number to request.
        Returns the decoded JSON response.
        '''
        params = {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "100",
            # The documented field name is lower camel case ("pageNumber").
            "paginationInput.pageNumber": i
            }
        response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                params=params)
        return response.json()

    # TODO add some other options to finding call api such as for possibly filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
    # try and come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say for example, if you took a
    # random set of 3 of 8 pictures total from each listing you might have a better chance of getting 3 good pictures in addition to increasing your training set. Or maybe you would have better luck with limiting
    # it to the first 5 pictures instead of random.
    # You may even have more consistency with used shoes since they are "one-off" items without confusing multiple variations and colors. What else you can do is run small training sets on both new and used
    # to see which one is more accurate or if a combo of both is more accurate.

    @staticmethod
    def _load_known_ids(path='training.csv'):
        '''Return every cell of the training csv as a set of strings (empty set if the file is missing or empty).'''
        try:
            # dtype=str keeps ids comparable with the string itemIds in API responses.
            training = pd.read_csv(path, dtype=str)
        except (pd.errors.EmptyDataError, FileNotFoundError):
            return set()
        # NOTE(review): the column holding the ids is not visible from here,
        # so every cell is treated as a potentially known id.
        return set(training.values.ravel())

    def _unseen_ids(self, data, seen_ids):
        '''Return the itemIds of one response page that are not in seen_ids, adding them to seen_ids.'''
        try:
            items = data[self.service + 'Response'][0]['searchResult'][0]['item']
        except (KeyError, IndexError, TypeError):
            # Pages without an 'item' list are skipped instead of aborting the run.
            return []
        fresh = []
        for item in items:
            item_id = item['itemId'][0]
            if item_id not in seen_ids:
                seen_ids.add(item_id)
                fresh.append(item_id)
        return fresh

    @staticmethod
    def _batch_ids(item_ids, size=20):
        '''Join item_ids into comma-separated strings of at most size ids each.'''
        return [','.join(item_ids[n:n + size]) for n in range(0, len(item_ids), size)]

    def get_ids_from_cats(self):
        '''
        Creates a list of 20-itemId strings to use for the ShoppingApi call.

        Reads the category ids from cat_list.txt (JSON), requests every page in
        self.pageNumber for each category, and keeps each itemId once, skipping
        ids already present in training.csv.
        Returns a list of strings, each holding up to 20 comma-separated itemIds.
        '''
        pages = self.pageNumber
        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        # Read the csv once; the same set then also tracks ids collected in this run.
        seen_ids = self._load_known_ids()
        itemid_results_list = []
        for category_id in cat_list:
            args = [(category_id, i) for i in pages]
            with concurrent.futures.ThreadPoolExecutor() as executor:
                # Results are consumed here in the calling thread, so the
                # de-duplication state is never touched by the workers.
                for data in executor.map(lambda p: self.get_data(*p), args):
                    itemid_results_list.extend(self._unseen_ids(data, seen_ids))

        return self._batch_ids(itemid_results_list)
# TODO during your try except conditionals just check the csv files. At the end you can create sets. You can create another condition that says if the final set is smaller than 100k then you can call finding
# service on more pages (but only pages you haven't tried) and repeat the search process.
# TODO instead of running through multiple try except loops try to implement set methods for efficiency and ease. Remember symmetric_difference, difference, intersection, set()
# for category_id in cat_list:
class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes
    '''

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds.

        twenty_id: comma-separated string of itemIds (GetMultipleItems
                   accepts at most 20 per call).
        Returns the list stored under 'Item' in the response, or an empty
        list when the response carries no 'Item' key.
        '''
        params = {
            "callname": "GetMultipleItems",
            # The config module is imported as `cfg`; the app id lives in cfg.sec.
            "appid": cfg.sec['SECURITY-APPNAME'],
            "version": "671",
            "responseencoding": "JSON",
            "ItemID": twenty_id,
            "IncludeSelector": "ItemSpecifics",
            }
        response = requests.get("https://open.api.ebay.com/shopping?", params=params)
        response = response.json()
        # A response without 'Item' yields no listings for this batch rather
        # than raising inside the thread pool and aborting the whole run.
        return response.get('Item', [])

    def conky(self):
        '''
        Collects the listings for every itemId batch produced by
        FindingApi(4, 2).get_ids_from_cats() and returns them as one list of
        dicts (one dict per listing).

        For some reason item_id_results can only be passed as argument in executor.map
        if the variable is made within function
        '''
        data = []  # TODO I think you need to append a list of dictionaries rather than update a dictionary of dictionaries. Training var will require an updated dictionary though
        finding = FindingApi(4, 2)
        item_id_results = finding.get_ids_from_cats()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
                # Each result is a list of listing dicts; flatten it into data so
                # the end result is a list of dicts where each dict is a listing.
                data.extend(future)
        # TODO each future is a list of dictionaries because the output of any multithreader in this method is a list.
        # data dictionary can't update from list of dicts unless iterated over. Might need a different way to update.
        # TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.
        return data
class CurateData:
    '''
    Contains functions for curating data for machine learning training sets;
    Takes item in data from ShoppingApi request as argument and extracts/ creates key
    value pairs that gets updated to custom dataframe used in Ml training sets.
    '''

    def import_raw(self):
        '''Loads and returns the JSON content of raw_data.txt.'''
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
        return raw_data

    def data_frame(self, data):
        '''Converts the JSON-serialisable listing data into a raw dataframe.'''
        from io import StringIO  # read_json no longer accepts a literal JSON string

        to_json = json.dumps(data)
        raw_df = pd.read_json(StringIO(to_json))
        return raw_df

    def to_training(self, data):
        '''
        Builds the base training dataframe: the six listing columns used
        downstream, with ItemID and PrimaryCategoryID stored as strings.
        '''
        raw_df = self.data_frame(data)
        columns = ['ItemID', 'PictureURL', 'PrimaryCategoryID',
                   'PrimaryCategoryName', 'Title', 'ItemSpecifics']
        # Copy so the conversion below never writes into a view of raw_df.
        training = raw_df.loc[:, columns].copy()
        training = training.astype({'ItemID': str, 'PrimaryCategoryID': str})
        return training

    def class_training(self, training):
        '''Returns the PictureURL and PrimaryCategoryID columns of training.'''
        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
        return class_training

    def nvl_training(self, training):
        '''
        Expands each listing's ItemSpecifics NameValueList into one column per
        Name (cell = that entry's Value) and prepends the PictureURL column.
        Listings without ItemSpecifics get empty (NaN) feature cells.
        '''
        def to_flat_dict(item_specifics):
            # A listing without specifics arrives as NaN rather than a dict.
            if not isinstance(item_specifics, dict):
                return {}
            pairs = item_specifics.get('NameValueList', [])
            if isinstance(pairs, dict):  # tolerate a single, unwrapped entry
                pairs = [pairs]
            return {pair['Name']: pair['Value'] for pair in pairs}

        nvl_dicts = [to_flat_dict(spec) for spec in training.ItemSpecifics]
        nvl_df = pd.json_normalize(nvl_dicts)
        # json_normalize numbers rows from 0; realign with training so that
        # concat pairs each PictureURL with its own listing's features.
        nvl_df.index = training.index
        nvl_training = pd.concat([training.PictureURL, nvl_df], axis=1)
        return nvl_training

    def extract_contents(self, df):
        '''
        Returns a copy of df in which every list/tuple cell is joined into a
        single space-separated string; missing cells become NaN and any other
        cell is left unchanged.
        '''
        def join_values(cell):
            if isinstance(cell, (list, tuple)):
                return ' '.join(map(str, cell))
            return np.nan if cell is None else cell

        return df.apply(lambda col: col.map(join_values))

    def drop_nvl_cols(self, nvl_training, mode=None):
        '''
        Reduces nvl_training to the relevant columns.

        mode: a string containing 'keep' selects only the col_keep columns;
              anything else drops the col_drop columns. When None (default)
              the user is prompted, as before.
        Columns named in either list that are absent from nvl_training are
        ignored.
        '''
        col_drop = [
            'Fabric Type', 'Type of Sport', 'Mid Sole', 'Modified Item',
            'Modification Description', 'Article Type', 'Customized',
            'Character', 'Features', 'Colors', 'Shade', 'Product ID',
            'Personalized', 'Platform Height', 'Year Manufactured',
            'Trim Material', 'Fashion Element', 'Shaft Material',
            'Character Family', 'Heel to Toe Drop', 'Custom Bundle',
            'California Prop 65 Warning', 'Manufacturer Color', 'Main Color',
            'Collection', 'Midsole Type', 'Signed', 'US Shoe Size (Men#!#s)',
            'Calf Circumference', 'Handmade', 'Safety Standards',
            'Customised', 'Cleat Type', 'Cushioning Level', 'AU Shoe Size',
            'Country/Region of Manufacture', 'Type of Sport', 'Main Colour',
            'Look', 'Sole Type', 'Sole Manufacturer Colour', 'Sole Material',
            'Toe Material', 'Feature', 'Length', 'Width', 'Size Chart',
            'Boot Height', 'Water Resistance Level', 'Material Composition',
            'Calf Width', 'Insole Material', 'UPC', 'Size Type'
            ]

        col_keep = [
            # 'PictureURL' is the column name nvl_training() actually produces.
            'PictureURL',
            'Picture URL', 'Style', 'Department', 'Type', 'Gender', 'Closure', 'Performance/Activity',
            'Accents', 'Occasion', 'Toe Shape', 'Pattern', 'Activity',
            'Heel Style', 'Fastening', 'Heel Type', 'Toe Type', 'Departement',
            'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material',
            ]

        # May be no difference between Product type and sub style; fastening and
        # closure; toe shape and toe type; occasion and performance/activity;
        # see if you can combine these somehow (you may not want this though).
        # Also consider keeping only cols that have plenty of values

        if mode is None:
            mode = input('drop or keep cols?:')

        if 'keep' in mode:
            wanted = set(col_keep)
            present = [col for col in nvl_training.columns if col in wanted]
            dropd_nvl_training = nvl_training.loc[:, present]
        else:
            dropd_nvl_training = nvl_training.drop(columns=col_drop, errors='ignore')
        return dropd_nvl_training

    def combine_nvlclass(self, class_training, dropd_nvl_training):
        '''Concatenates the two dataframes column-wise.'''
        # NOTE(review): when both inputs come from class_training() and
        # drop_nvl_cols(), each carries PictureURL, so the result has that
        # column twice -- confirm whether that is intended.
        final_training = pd.concat([class_training, dropd_nvl_training], axis=1)
        return final_training

    # TODO Still need to to extract strings from list of strings and then drop which ones you don't want or vice versa. You may have to avoid using cells with lists of strings longer than one (e.g., 'Features')
    # TODO Also need to expand photo list from PictureURL. Decide how many or which photos to use. You may even want to use a pretrained model to decide whether or not the photos are of shoes or not to filter#
    # it might be that only the first picture is reliable enough to use in the dataset.
    # TODO also need to decide which features are going to be relevant. For example, is color really necessary for finding features? is it necessary to train your model on this or can you find color an easier way?

    def update_df(self, data):
        '''
        Creates training instances for dataset. picture_url_list expanded to
        max available pictures with each picture url corresponding to features
        in common with same listing (i.e., because there are multiple pictures
        per listing, each picture will be its own training instance.

        Not implemented yet: currently does nothing and returns None.
        '''
        pass
# Ultimately you need each record to be one picture url as input and relevant columns determined from custom nvl_dicts. You can figure out how you need to address the multiple values in the lists when you make the df just before the final df (this one may include the multiple pictures from each list in the original records. This should be your next step).
# Considering the above, you need to figure out how to expand the url list while keeping the nvl_df intact
# So, before you can do the above two comments you should first figure out what kind of format you will need your df to be in for training. You require multilabel/multiclass(?)...consult that one article on identifying rainforests, and also hands on machine learning with blah blah. Also consult HPP for combining dfs efficiently.
# USE combination of apply() and dict comprehension to extract your custom nvl_dict from nvl in each cell
# USE training.apply(func, axis= something) to create your custom nvl_dict for each cell
# USE raw_df.loc[:, ['col1', col2', 'col3', 'etc']] for creating new df. There may be another way though.
# USE pd.merge() at some point...possibly after expanding lists and nvl. consult HPP book for a more efficient way to combine dfs.
# USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can transform each new raw_df first before combining it with the previous transformed
# df. then you can take the raw_df and combine it with the old raw_df for backup.
# TODO You will have to mess around more with pandas df to find a better solution to creating your csv file: i.e., create dataframe from from instances, run through process to customize your df
# for final training set for your ml model training. Contemplate on the future... you want ability to update main csv AND training csv; one for updating raw data instances from search queries, and
# the other for updating your training set.
def main():
    '''
    Program entry point. Intended to create or refresh the csv file used for
    ML training from live ebay listings; for now it is a placeholder that
    performs no work.
    '''
    # main goes here:
    return None


if __name__ == "__main__":
    main()
# Limited to 5000 calls to shopping api per day, and the GetMultipleItems service maxes out at 20 items
# per call leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.
# for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
# can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.
# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
# TO AVOID HICCUPS WHEN CREATING DATASET
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
'''
List of columns from nvl_list that I want to drop before training:
['Fabric Type', 'Type of Sport', 'Mid Sole', 'Modified Item', 'Modification Description', 'Article Type', 'Customized', 'Character', 'Features', 'Colors', 'Shade', 'Product ID', 'Personlized', 'Platform Height',
'Year Manufactured', 'Trim Material', 'Fashion Element', 'Shaft Material', 'Character Family', 'Heel to Toe Drop', 'Custom Bundle', 'California Prop 65 Warning', 'Manufacture Color', 'Main Color',
'Collection', 'Mid Sole Type', 'Signed', 'US Shoe Size (Men#!#s)', 'Calf Circumference', 'Hand Made', 'Safety Standards', 'Customised', 'Cleat Type', 'Cushioning Level', 'AU Shoe Size', 'Country/Region of Manufacture',
'Type of Sport', 'Main Colour', 'Look']
'''
'''
list of columns from nvl_list that I want to keep before training:
[
'Picture URL', 'Style', 'Department', 'Type', 'Gender', 'Closure', 'Performance/Activity',
'Accents', 'Occasion', 'Toe Shape', 'Pattern', 'Activity',
'Heel Style', 'Fastening', 'Heel Type', 'Toe Type', 'Closure Type', 'Departement',
'Product Type', 'Sub Style', 'Season', 'Theme', 'Material', 'Upper Material',
]
'''
# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)