import importlib
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
class FindingApi:
    '''Methods for accessing eBay's FindingApi services'''

    def __init__(self, service, pageNumber):
        self.service = [
            'findItemsAdvanced', 'findCompletedItems',
            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
            'findItemsByProduct'
        ][service]
        # 64 pages is recommended as this will give equal weights to cats
        # given call constraints
        self.pageNumber = list(range(1, pageNumber))

        # departments = ["3034","93427"] (womens and mens)

    def get_data(self, category_id, i):
        '''
        Gets raw JSON data from a FindingApi service call.
        Currently being used to get itemIDs from categories.
        '''
        params = {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": "scottbea-xlister-PRD-6796e0ff6-14862949",
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "100",
            "paginationInput.PageNumber": i
        }

        response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                params=params)
        data = response.json()
        return data

    def get_ids_from_cats(self):
        '''
        Creates a 20-itemId list to use for the ShoppingApi call.
        '''
        pages = self.pageNumber
        itemid_results_list = []

        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        for category_id in cat_list:
            args = [(category_id, i) for i in pages]
            with concurrent.futures.ThreadPoolExecutor() as executor:
                for future in executor.map(lambda p: self.get_data(*p), args):
                    data = future
                    try:
                        training = pd.read_csv('training.csv')
                        for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                            # Compare on the itemId string itself; comparing the raw
                            # item dict against lists of id strings never matches.
                            if (item['itemId'][0] not in training.values) and (item['itemId'][0] not in itemid_results_list):
                                itemid_results_list.append(item['itemId'][0])

                    except (pd.errors.EmptyDataError, FileNotFoundError):
                        for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
                            if item['itemId'][0] not in itemid_results_list:
                                itemid_results_list.append(item['itemId'][0])

        # Deduplicate first, then chunk the deduplicated ids into comma-separated
        # strings of 20 (the GetMultipleItems maximum per call).
        item_id_results = list(set(itemid_results_list))
        item_id_results = [','.join(item_id_results[n:n + 20])
                           for n in range(0, len(item_id_results), 20)]

        return item_id_results

# TODO instead of running through multiple try/except loops, try to implement set
# methods for efficiency and ease. Remember symmetric_difference, difference,
# intersection, set(). A minimal sketch follows below.
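
# A minimal sketch of the set-based filtering the TODO above suggests; the helper
# and its arguments are hypothetical, not part of the existing classes.
def new_ids_only(fetched_ids, known_ids):
    '''Returns the fetched ids not already present in known_ids, deduplicated.'''
    return list(set(fetched_ids).difference(known_ids))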


class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes.
    '''

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds.
        '''
        params = {
            "callname": "GetMultipleItems",
            "appid": "scottbea-xlister-PRD-6796e0ff6-14862949",
            "version": "671",
            "responseencoding": "JSON",
            "ItemID": twenty_id,
            "IncludeSelector": "ItemSpecifics",
        }
        response = requests.get("https://open.api.ebay.com/shopping", params=params)
        response = response.json()
        response = response['Item']
        return response

    def conky(self):
        '''
        For some reason item_id_results can only be passed as an argument in
        executor.map if the variable is made within the function.
        '''
        data = []  # TODO consider appending a list of dictionaries rather than updating a dictionary of dictionaries. The training var will require an updated dictionary though
        finding = FindingApi(4, 2)
        item_id_results = finding.get_ids_from_cats()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
                for item in future:
                    data.append(item)  # The end result should be a list of dicts where each dict in the list is a listing
        # TODO each future is a list of dictionaries, because the output of any multithreader
        # in this method is a list. A data dictionary can't update from a list of dicts unless
        # iterated over; might need a different way to update.
        return data

# TODO It seems like the problem with updating the dictionary/csv file may start here;
# the item data may be getting appended out of order relative to the item itself.


class CurateData:
    '''
    Contains functions for curating data for machine learning training sets;
    takes item data from the ShoppingApi request as an argument and extracts/creates
    key-value pairs that get updated to a custom dataframe used in ML training sets.
    '''

    def import_raw(self):
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
        return raw_data

    def data_frame(self, data):
        to_json = json.dumps(data)
        raw_df = pd.read_json(to_json)
        return raw_df

    def to_training(self, data):
        raw_df = self.data_frame(data)
        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
        training = interm_df1
        return training

    def nvl_dict(self, training):
        interm_df1 = pd.Series(training.ItemSpecifics)
        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
        # Builds one {Name: Value} dict per listing from its NameValueList entries.
        nvl_dict = interm_df1.apply(lambda x: {nvl['Name']: nvl['Value'] for nvl in x})
        return nvl_dict

    def update_df(self, data):
        '''
        Creates training instances for the dataset. picture_url_list is expanded
        to the max available pictures, with each picture url corresponding to the
        features it shares with the same listing (i.e., because there are multiple
        pictures per listing, each picture will be its own training instance).
        A hedged sketch follows the notes below.
        '''
        # USE a combination of apply() and a dict comprehension to extract your custom nvl_dict from the nvl in each cell
        # USE training.apply(func, axis=something) to create your custom nvl_dict for each cell
        # USE raw_df.loc[:, ['col1', 'col2', 'col3', 'etc']] for creating a new df. There may be another way though.
        # USE pd.merge() at some point...possibly after expanding lists and nvl
        # USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can
        # transform each new raw_df first before combining it with the previous transformed df. Then you can
        # take the raw_df and combine it with the old raw_df for backup.

        # TODO You will have to mess around more with the pandas df to find a better solution for creating your
        # csv file: i.e., create a dataframe from instances, then run it through a process to customize the df
        # for the final training set for your ml model training. Contemplate the future... you want the ability
        # to update both the main csv AND the training csv; one for updating raw data instances from search
        # queries, and the other for updating your training set.
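        # A minimal sketch of the expansion described in the docstring, not a final
        # implementation: it assumes PictureURL holds a list of urls per row, so
        # explode() (pandas >= 0.25) yields one training instance per picture.
        training = self.to_training(data)
        return training.explode('PictureURL').reset_index(drop=True)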


def main():
    '''
    Main program creates/updates a csv file to use for ML training from live
    eBay listings.
    '''
    pass
    # main goes here:
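    # A hedged sketch of the intended flow, left commented out because it would
    # make live api calls; the names follow the classes defined above:
    # shopping = ShoppingApi()
    # data = shopping.conky()
    # curate = CurateData()
    # training = curate.to_training(data)
    # training.to_csv('training.csv', mode='a', index=False)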


if __name__ == "__main__":
    main()

# Limited to 5000 calls to the Shopping api per day, and the GetMultipleItems service maxes out at
# 20 items per call, leaving you 100,000 items per day (5,000 calls x 20 items) for your pandas
# dataframe initially. So you'll have to divide these up into the categories. Spread over 16
# categories, this will leave you with about 6.25K results per cat. More than enough data for your dataset.

# For future reference: to deal with inconsistent values in the nvl (due to sellers inputting custom
# values in the fields), you can drop either listings or k/v pairs that are unique. These can be found
# by applying a function to determine the frequency of k/v pairs --> list of unique k/v pairs -->
# function to determine the frequency of unique k/v pairs --> drop those with a count of 1.
# A rough sketch of this filter follows below.
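
# A rough sketch of the frequency filter described above; assumes nvl_dicts is an
# iterable of the per-listing dicts produced by CurateData.nvl_dict.
from collections import Counter

def drop_rare_pairs(nvl_dicts):
    '''Drops k/v pairs that appear in only one listing across the dataset.'''
    # Values from the api come back as lists, so convert them to tuples for hashing.
    as_key = lambda k, v: (k, tuple(v) if isinstance(v, list) else v)
    counts = Counter(as_key(k, v) for d in nvl_dicts for k, v in d.items())
    return [{k: v for k, v in d.items() if counts[as_key(k, v)] > 1}
            for d in nvl_dicts]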

# TODO NEED TO ADD TRY/EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES
# TO AVOID HICCUPS WHEN CREATING THE DATASET. A sketch of one guarded call follows below.
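
# A hedged sketch of the guarded call the TODO above asks for; the helper name,
# retry count, and timeout are assumptions, not part of the existing code.
def safe_get(url, params, retries=3):
    '''Retries a GET request a few times before giving up, to smooth over hiccups.'''
    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError):
            if attempt == retries - 1:
                raise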

# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS, EITHER USING YOUR OWN TAGS OR
# SOMEHOW FINDING TAGS ON OTHERS' LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding
# a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it
# may actually be a good thing to include all the pictures in a training set, but then when you're ready
# to begin training you'll have a data cleaning pipeline specific to training a model to either learn
# shoe features or information on tags.