2021-01-23 06:21:56 +00:00
import importlib
import numpy as np
2020-11-09 01:47:03 +00:00
import concurrent . futures
2020-10-12 07:53:29 +00:00
import json
import requests
import pandas as pd
class FindingApi:
    '''Methods for accessing eBay's Finding API services.'''

    def __init__(self, service, pageNumber):
        # service: integer index selecting one of the supported Finding API
        # call names below (e.g. 4 -> 'findItemsByCategory').
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores',
            'findItemsByCategory', 'findItemsByProduct'
        ][service]
        # Pages 1 .. pageNumber-1; 64 pages is recommended.
        self.pageNumber = list(range(1, pageNumber))

    # this will give equal weights to cats given call constraints
    # departments = ["3034","93427"] (womens and mens)

    def get_data(self, category_id, i):
        '''
        Gets raw JSON data from a FindingApi service call.
        Currently being used to get itemIDs from categories.

        category_id -- eBay category to search.
        i -- page number for paginationInput.
        '''
        params = {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": "scottbea-xlister-PRD-6796e0ff6-14862949",
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "100",
            "paginationInput.PageNumber": i
        }
        response = requests.get(
            "https://svcs.ebay.com/services/search/FindingService/v1",
            params=params)
        return response.json()

    def get_ids_from_cats(self):
        '''
        Creates a list of 20-itemId comma-joined strings to use for the
        ShoppingApi GetMultipleItems call (which accepts at most 20 ids).
        '''
        pages = self.pageNumber
        itemid_results_list = []

        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        for category_id in cat_list:
            args = [(category_id, i) for i in pages]
            with concurrent.futures.ThreadPoolExecutor() as executor:
                for data in executor.map(lambda p: self.get_data(*p), args):
                    # NOTE(review): threads share itemid_results_list, so the
                    # duplicate checks below can race — TODO confirm intent.
                    try:
                        training = pd.read_csv('training.csv')
                        for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                            if (item not in training.values) and (item not in itemid_results_list):
                                itemid_results_list.append(item['itemId'][0])
                    except (pd.errors.EmptyDataError, FileNotFoundError):
                        # No training file yet (or it is empty): skip the
                        # cross-reference and only dedupe within this run.
                        for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                            if item not in itemid_results_list:
                                itemid_results_list.append(item['itemId'][0])

        # BUG FIX: the original built list(set(...)) and then immediately
        # overwrote it by chunking the raw (non-deduped) list, discarding the
        # deduplication. Dedupe first, then chunk into comma-joined 20-groups.
        deduped = list(set(itemid_results_list))
        item_id_results = [','.join(deduped[n:n + 20])
                           for n in range(0, len(deduped), 20)]
        return item_id_results
2020-12-25 19:15:20 +00:00
## TODO instead of running through multiple try except loops try to implement set methods for efficiency and ease. Remember symmetric_difference, difference, intersection, set()
# for category_id in cat_list:
2020-10-18 07:08:04 +00:00
class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes.
    '''

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds.

        twenty_id -- comma-joined string of up to 20 itemIds
        (GetMultipleItems accepts at most 20 ids per call).
        Returns the list of listing dicts under the response's 'Item' key.
        '''
        params = {
            "callname": "GetMultipleItems",
            "appid": "scottbea-xlister-PRD-6796e0ff6-14862949",
            "version": "671",
            "responseencoding": "JSON",
            "ItemID": twenty_id,
            "IncludeSelector": "ItemSpecifics",
        }
        response = requests.get("https://open.api.ebay.com/shopping?",
                                params=params)
        return response.json()['Item']

    def conky(self):
        '''
        Fans out GetMultipleItems calls over every 20-id group and returns a
        flat list of listing dicts (one dict per listing).

        For some reason item_id_results can only be passed as argument in
        executor.map if the variable is made within the function.
        '''
        data = []
        finding = FindingApi(4, 2)
        item_id_results = finding.get_ids_from_cats()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for listing_group in executor.map(
                    self.get_item_from_findItemsByCategory, item_id_results):
                # Each future result is a list of listing dicts; flatten it.
                data.extend(listing_group)
        return data
2021-01-23 06:21:56 +00:00
# TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.
2020-10-18 00:22:45 +00:00
class CurateData:
    '''
    Contains functions for curating data for machine learning training sets;
    takes item data from a ShoppingApi request as argument and extracts/creates
    key-value pairs that get collected into a flat dataframe used in ML
    training sets.
    '''

    def extract_itemId(self, item):
        '''Return {'ItemID': ...} pulled from a raw listing dict.'''
        return {'ItemID': item['ItemID']}

    def extract_catId(self, item):
        '''Return {'PrimaryCategoryID': ...} pulled from a raw listing dict.'''
        return {'PrimaryCategoryID': item['PrimaryCategoryID']}

    def extract_prime_cat_name(self, item):
        '''Return {'PrimaryCategoryName': ...} from a raw listing dict.'''
        return {'PrimaryCategoryName': item['PrimaryCategoryName']}

    def extract_picture_url(self, item):
        '''
        Only pulls the PictureURL list and does not
        create a dictionary.
        '''
        return item['PictureURL']

    def extract_nvl(self, item):
        '''Flatten ItemSpecifics' NameValueList into a {Name: Value} dict.'''
        names = []
        values = []
        for pair in item['ItemSpecifics']['NameValueList']:
            names.append(pair['Name'])
            values.append(pair['Value'])
        return dict(zip(names, values))

    def update_df(self, data):
        '''
        Creates training instances for the dataset, one per listing.

        BUG FIX: the original reused a single shared `training` dict across
        the loop, so every listing clobbered the previous one and the
        normalized frame was built from whatever survived; it also discarded
        the computed `df`. Now each listing gets its own record dict, all
        records are normalized together, and the frame is returned.

        data -- list of listing dicts from ShoppingApi.conky().
        Returns a flat pandas DataFrame (one row per listing).
        '''
        instances = []
        for item in data:
            training = {}
            training.update(self.extract_itemId(item))
            training.update(self.extract_catId(item))
            training.update(self.extract_prime_cat_name(item))
            training.update(self.extract_nvl(item))
            instances.append(training)
            print(training)  # kept for parity with the original's debugging output
        df = pd.json_normalize(instances)
        # df.to_csv('training.csv', mode='a')
        return df

    def data_frame(self, data):
        '''Round-trip `data` through JSON into a raw pandas DataFrame.'''
        from io import StringIO  # pd.read_json on a literal string is deprecated
        return pd.read_json(StringIO(json.dumps(data)))
2021-01-23 08:38:04 +00:00
# TODO You will have to mess around more with pandas df to find a better solution to creating your csv file: i.e., create dataframe from from instances, run through process to customize your df
# for final training set for your ml model training. Contemplate on the future... you want ability to update main csv AND training csv; one for updating raw data instances from search queries, and
# the other for updating your training set.
2021-01-23 06:21:56 +00:00
2020-10-12 07:53:29 +00:00
2020-10-18 00:22:45 +00:00
def main():
    '''
    Main program creates/updates a csv file to use for ML training from live
    ebay listings.
    '''
    # Pull live listing data, hand it to the curation step, and pass the raw
    # data back to the caller for inspection.
    shopping = ShoppingApi()
    data = shopping.conky()
    curate = CurateData()
    curate.update_df(data)
    return data
2020-10-12 07:53:29 +00:00
2020-10-18 00:22:45 +00:00
# BUG FIX: the comparison string must be exactly "__main__" — the padded
# " __main__ " literal never matched, so main() was never invoked.
if __name__ == "__main__":
    main()
2020-11-09 01:47:03 +00:00
2020-10-12 07:53:29 +00:00
# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
# per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.
2020-10-17 23:21:11 +00:00
# Need to make sure dataframe gets important stuff outside of nvl in order to
# access values for cross referencing itemIds from calls
# Need to decide if list gets accessed from df or if you're just going to have
# list contents extracted and possibly placed into separate cells/labels
2020-11-09 01:47:03 +00:00
# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
# TO AVOID HICCUPS WHEN CREATING DATASET
2020-12-13 01:56:51 +00:00
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.