529 lines
22 KiB
Python
529 lines
22 KiB
Python
import os
|
|
from time import sleep
|
|
from random import randint
|
|
import scrape_ids
|
|
from datetime import datetime, timedelta
|
|
import dateutil
|
|
from dateutil import parser
|
|
import pytz
|
|
import pdb
|
|
from io import StringIO
|
|
import numpy as np
|
|
import concurrent.futures
|
|
import json
|
|
import requests
|
|
import pandas as pd
|
|
import config as cfg
|
|
import shutil
|
|
import re
|
|
|
|
from ebaysdk.exception import ConnectionError
|
|
from ebaysdk.trading import Connection as Trading
|
|
from ebaysdk.finding import Connection as Finding
|
|
from ebaysdk.shopping import Connection as Shopping
|
|
|
|
class FindingApi:
    '''
    Methods for accessing eBay's FindingApi services
    '''

    # Operation names selectable by index through the constructor.
    SERVICES = (
        'findItemsAdvanced', 'findCompletedItems',
        'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
        'findItemsByProduct',
    )
    ENDPOINT = "https://svcs.ebay.com/services/search/FindingService/v1"

    def __init__(self, service):
        '''
        service: integer index into SERVICES selecting the Finding API
        operation. Currently using only index 4, i.e., service = 4
        '''
        self.service = self.SERVICES[service]
        # examples of additional params you may want to add:
        # 'itemFilter(0).value':'Used' consider using this with findCompletedItems call
        # 'itemFilter(1).name':'ListingType'
        # 'itemFilter(1).value':'AuctionWithBIN'
        # 'StartTimeNewest'
        # HideDuplicateItems

    @staticmethod
    def _extract_item_ids(payload, service='findItemsByCategory'):
        '''
        Pulls the itemIds out of a parsed Finding API JSON payload, dropping
        duplicates while keeping the order returned by the service.

        Raises KeyError/IndexError/TypeError when the payload does not have
        the expected "<operation>Response" shape.
        '''
        items = payload[service + 'Response'][0]['searchResult'][0]['item']
        return list(dict.fromkeys(item['itemId'][0] for item in items))

    @staticmethod
    def _batch_ids(ids, size=20):
        '''
        Joins ids into comma separated strings holding at most `size` ids each.
        '''
        return [','.join(ids[start:start + size])
                for start in range(0, len(ids), size)]

    def get_data(self, category_id):
        '''
        Gets raw JSON data from FindingApi service call. Currently being used to
        get itemIDs from categories.

        Returns a list of unique itemIds for category_id; the list is empty
        when the request fails or the response cannot be parsed.
        '''
        # startTime = dateutil.parser.isoparse( startTime )
        # now = datetime.datetime.now(tz=pytz.UTC)
        # days_on_site = (now - startTime).days # as int

        params = {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "100",
            "paginationInput.pageNumber": "1",
            "itemFilter(0).name": "Condition",
            "itemFilter(0).value": "Used",
            # every filter needs its own array index, otherwise it collides
            # with itemFilter(0)
            "itemFilter(1).name": "HideDuplicateItems",
            "itemFilter(1).value": "true",
            "sortOrder": "StartTimeNewest",
        }
        # "itemFilter(2).name":"TopRatedSellerOnly", # TODO fix here
        # "itemFilter(2).value":"true"

        try:
            response = requests.get(self.ENDPOINT, params=params, timeout=24)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # TODO need to be able to continue where you left off or use better timeout?
            print('connection error')
            return []

        try:
            data = response.json()
        except ValueError:
            print('Response is not valid JSON. Exiting')
            print(response.text)
            return []

        try:
            return self._extract_item_ids(data, self.service)
        except (AttributeError, KeyError, IndexError, TypeError):
            # e.g. an error payload, or a category without any results
            print('AttributeError or KeyError. Exiting')
            print(data)
            return []

    # TODO add some other options to finding call api such as for possibly filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
    # try and come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say for example, if you took a
    # random set of 3 of 8 pictures total from each listing you might have a better chance of getting 3 good pictures in addition to increasing your training set. Or maybe you would have better luck with limiting
    # it to the first 5 pictures instead of random.

    # You may even have more consistency with used shoes since they are "one-off" items without confusing multiple variations and colors. What else you can do is run small training sets on both new and used
    # to see which one is more accurate or if a combo of both is more accurate.

    def get_ids_from_cats(self):
        '''
        Creates a 20-itemId list to use for the ShoppingApi
        call

        Reads the category ids from cat_list.txt, collects the itemIds of
        every category in worker threads and saves the unique ids to
        raw_ids.txt.

        Returns (item_id_results, itemid_results_list): the ids joined into
        comma separated batches of 20, and the flat list of unique ids.
        '''
        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        collected = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for ids in executor.map(self.get_data, cat_list):
                collected.extend(ids)

        print(len(collected))
        # The same listing can show up in several categories; requesting it
        # twice would only burn ShoppingApi quota.
        itemid_results_list = list(dict.fromkeys(collected))
        print(len(itemid_results_list))

        with open('raw_ids.txt', 'w') as f:
            json.dump(itemid_results_list, f)

        # 20-ItemID list created to maximize dataset/decrease calls given call constraints
        item_id_results = self._batch_ids(itemid_results_list)

        return item_id_results, itemid_results_list
|
|
|
|
class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes
    '''

    @staticmethod
    def _headers():
        '''
        Request headers shared by every ShoppingApi call.
        '''
        return {
            "X-EBAY-API-IAF-TOKEN": cfg.sec['X-EBAY-API-IAF-TOKEN'],  # TODO implement auto oauth token renewal
            "version": "671",
        }

    def update_cats(self):
        '''
        Updates cat_list.txt

        Requests the child categories of each parent department and writes
        their ids to cat_list.txt. A department whose request fails is
        skipped; the file is left untouched when nothing was retrieved.
        '''
        parent_cats = ['3034', '93427']  # Women's and Men's shoe departments
        cat_list = []

        for department in parent_cats:
            url = "https://open.api.ebay.com/shopping?&callname=GetCategoryInfo&responseencoding=JSON&IncludeSelector=ChildCategories&CategoryID="+department

            try:
                response = requests.get(url, headers=self._headers(), timeout=4)
                response.raise_for_status()
                # excludes index 0 as this is parent node, i.e., women's or men's dept.
                children = response.json()['CategoryArray']['Category'][1:]
                child_ids = [cat['CategoryID'] for cat in children]
            except (requests.exceptions.RequestException, ValueError,
                    KeyError, TypeError):
                # without a usable response there is nothing to parse
                print('connection error')
                continue

            cat_list.extend(child_ids)

        if not cat_list:
            # do not clobber a previously saved category list with an empty one
            print('no categories retrieved. cat_list.txt not updated')
            return

        with open('cat_list.txt', 'w') as f:
            json.dump(cat_list, f)

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds

        twenty_id: comma separated string of itemIds.
        Returns the 'Item' entry of the GetMultipleItems response, or None
        when the request fails or the response has no 'Item' entry.
        '''
        url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID="+twenty_id

        payload = None
        try:
            # random sleep here between 0 and 10 secs?
            # sleep(randint(1,10)) # may not be necessary
            response = requests.get(url, headers=self._headers(), timeout=24)
            response.raise_for_status()
            payload = response.json()
            return payload['Item']

        except (requests.exceptions.RequestException, ValueError,
                KeyError, TypeError) as err:
            print('connection error. IP limit possibly exceeded')
            # show the service's answer when there is one, else the error
            print(payload if payload is not None else err)
            return None  # Handled at conky()

    def conky(self, twenty_ids_list):
        '''
        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
        data for creating training sets

        New listings are appended to whatever raw_data.txt already holds and
        the combined list is written back and returned. Collection stops at
        the first failed call.
        '''
        try:
            with open('raw_data.txt') as f:
                data = json.load(f)
        except (FileNotFoundError, ValueError):
            data = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for items in executor.map(self.get_item_from_findItemsByCategory, twenty_ids_list):
                if items is None:
                    print('response is None')
                    break
                # The end result should be a list of dicts where each dict in the list is a listing
                data.extend(items)

        with open('raw_data.txt', 'w') as f:
            json.dump(data, f)
        return data
|
|
|
|
# NOTE:
|
|
|
|
# Limited to 5000 calls to shopping api per day, and GetMultipleItems service maxes out at 20 items
|
|
# per call leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
|
|
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
|
|
# More than enough data for your dataset.
|
|
|
|
|
|
class CurateData:
    '''
    Contains methods for curating data for machine learning training sets;
    Takes item in data from ShoppingApi request as argument and extracts/ creates key
    value pairs that gets updated to custom dataframe used in Ml training sets.
    '''

    # Name of a picture: the path segment in front of "/$_", or the file name
    # in front of its image extension.
    _NAME_RE = re.compile(r'[^/]+(?=/\$_|\.(?:jpg|jpeg|png))', re.IGNORECASE)
    _EXT_RE = re.compile(r'\.(?:jpeg|jpg|png)', re.IGNORECASE)

    def import_raw(self):
        '''
        imports raw response json from local file. This is data from
        GetMultipleItems call in ShoppingApi
        '''
        with open('raw_data.txt') as f:
            return json.load(f)

    def raw_df(self, raw_data):
        '''
        creates pandas df from raw json and saves master raw csv file as raw_df.csv.
        Intended to be used inline with direct
        data stream from ebay's APIs

        Listings repeated in raw_data (same ItemID) are kept only once.
        '''
        raw_df = pd.read_json(StringIO(json.dumps(raw_data)))
        if 'ItemID' in raw_df.columns:
            # raw_data.txt is appended to on every run, so a listing can be
            # present more than once
            raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)
        raw_df.to_csv('raw_df.csv')  # NOTE not append mode because raw_df is made from the master raw_data.txt file

        # TODO still saving "Unnamed:0" column

        return raw_df

    def to_training(self, raw_data):
        '''
        creates first pass of potential labels for training set. This is the base
        df used to produce other training sets to use.

        Returns the listings that have ItemSpecifics, with ItemID and
        PrimaryCategoryID converted to strings.
        '''
        wanted = ['ItemID', 'PictureURL', 'PrimaryCategoryID',
                  'PrimaryCategoryName', 'Title', 'ItemSpecifics']
        # .copy() so the conversion below never writes into raw_df
        interm_df1 = self.raw_df(raw_data).loc[:, wanted].copy()
        interm_df1 = interm_df1.astype({'ItemID': str, 'PrimaryCategoryID': str})
        training = interm_df1.dropna(subset=['ItemSpecifics'])
        return training  # TODO RENAME THIS FUNC AND its RETURN VALUE

    def class_training(self, training):
        '''Training set for multiclass portion of training set. Used to train
        separately from multilabel portion
        '''
        return training.loc[:, ['PictureURL', 'PrimaryCategoryID']]

    @staticmethod
    def _nvl_to_dict(name_value_list):
        '''
        Turns a NameValueList into a {Name: Value} dict.
        '''
        if isinstance(name_value_list, dict):
            # NOTE(review): guards against a lone name/value pair arriving as
            # a dict instead of a one-element list - confirm against the API
            name_value_list = [name_value_list]
        return {pair['Name']: pair['Value'] for pair in name_value_list}

    def nvl_training(self, training):
        '''
        Training set for multilabel portion

        Returns a df with a fresh 0..n-1 index holding PictureURL plus one
        column per item specific name.
        '''
        name_value_lists = training['ItemSpecifics'].apply(lambda x: x['NameValueList'])

        # Necessary for json_normalize():
        records = [self._nvl_to_dict(nvl) for nvl in name_value_lists]
        nvl_df = pd.json_normalize(records)

        # training lost rows to dropna(), so its index has gaps while nvl_df
        # is numbered 0..n-1; concat aligns on the index, hence the reset
        pictures = training['PictureURL'].reset_index(drop=True)
        return pd.concat([pictures, nvl_df], axis=1)

    def extract_df(self, df):
        '''
        converts single-value lists of strings of any df to string if not null
        '''
        def flatten(cell):
            if isinstance(cell, list):
                return ' '.join(map(str, cell))
            return np.nan if pd.isnull(cell) else cell

        # DataFrame.applymap was renamed to DataFrame.map in newer pandas;
        # looked up on the class so a column called "map" cannot shadow it
        if hasattr(pd.DataFrame, 'map'):
            return pd.DataFrame.map(df, flatten)
        return pd.DataFrame.applymap(df, flatten)

    def drop_nvl_cols(self, nvl_training):
        '''
        Optionally trims nvl_training down to PictureURL plus the item
        specifics listed in cat_spacs.txt, minus the names excluded below.
        Asks on stdin; any answer starting with "n" returns nvl_training as is.
        '''
        with open('cat_spacs.txt') as f:
            cat_spacs = json.load(f)

        drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line',
                'Personalize', 'Fabric Type', 'Customized', 'Release Year',
                'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade',
                'Signed', 'Silhouette', 'Insole Material', 'Lining Material',
                'California Prop 65 Warning', 'Character Family', 'Character',
                'Cushioning Level', 'Personalization Instructions', 'Pronation',
                ]
        drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features',
                  'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size',
                  'Country/Region of Manufacture', 'Brand', 'Model']
        excluded = set(drop) | set(drop_2)
        # dict.fromkeys: a name listed twice must not select a column twice
        keep = [spec for spec in dict.fromkeys(cat_spacs) if spec not in excluded]

        user_input = input('drop cols? (y,n; default=y): ')
        if user_input.strip().lower().startswith('n'):
            return nvl_training

        available = set(nvl_training.columns)
        # PictureURL first, followed by the wanted specifics that exist in the df
        cols = ['PictureURL'] + [col for col in keep
                                 if col in available and col != 'PictureURL']
        return nvl_training[cols]

    # for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
    # can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.

    # Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)

    @staticmethod
    def _first_picture(urls):
        '''
        Returns the first URL of a picture list, NaN when there is none.
        '''
        if isinstance(urls, str):
            return urls
        if isinstance(urls, (list, tuple)) and urls:
            return urls[0]
        return np.nan

    @staticmethod
    def _explode_pictures(df):
        '''
        One row per picture URL, without empty or repeated URLs.
        '''
        exploded = df.explode('PictureURL').reset_index(drop=True)
        exploded = exploded.dropna(subset=['PictureURL'])
        return exploded.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

    def expand_nvlclass(self, class_training, dropd):
        '''
        takes image url list from each cell and expands them into separate/duplicate
        instances. Returns new class training and dropd dfs; the dfs passed
        in are not modified. Adds the image URLs to temp_pics_source_list.txt
        and saves both dfs as expanded_class.csv / expanded_dropd.csv.
        * consider applying this function to other cells that have multiple values in their lists
        '''
        expand = input("expand image list or use primary listing image? (y or n): ")
        if 'y' in expand.lower():
            expanded_class = self._explode_pictures(class_training)
            expanded_dropd = self._explode_pictures(dropd)
        else:
            # primary image only; work on copies so the caller's dfs stay intact
            expanded_class = class_training.copy()
            expanded_class['PictureURL'] = expanded_class['PictureURL'].apply(self._first_picture)
            expanded_class = expanded_class.dropna()

            expanded_dropd = dropd.copy()
            expanded_dropd['PictureURL'] = expanded_dropd['PictureURL'].apply(self._first_picture)
            expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])

        expanded_dropd = self.extract_df(expanded_dropd)  # convert lists to values

        # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt
        new_urls = expanded_class.PictureURL.to_list()
        try:
            with open('temp_pics_source_list.txt') as f:
                known_urls = list(json.load(f))
        except (ValueError, FileNotFoundError):
            # script is ran for 1st time and file not present (or unreadable)
            known_urls = []

        # ensures no duplicate source URLs exist
        temp_pics_source_list = list(dict.fromkeys(known_urls + new_urls))
        with open('temp_pics_source_list.txt', 'w') as f:
            json.dump(temp_pics_source_list, f)

        # Append to master training dataframes, drop potential dupes and save
        expanded_class.to_csv('expanded_class.csv')
        expanded_dropd.to_csv('expanded_dropd.csv')

        return expanded_class, expanded_dropd

    def dl_pic(self, dict_pics, pic):
        '''
        Downloads the picture at URL pic to the path dict_pics[pic] unless
        that file already exists.

        Returns False when a download was attempted and failed, True otherwise
        (downloaded, already present, or no target path known for pic).
        '''
        try:
            target = dict_pics[pic]
        except KeyError:
            return True  # URL without a usable file name; nothing to download

        # TODO DOES NOT FIND CURRENT PATHS BECUASE TAGS WILL NOW BE DIFFERENT. YOU WILL END UP RE DOWNLOADING IMAGES
        if os.path.exists(target):
            return True

        try:
            with requests.get(pic, stream=True, timeout=24) as r:
                r.raise_for_status()  # never store an error page as an image
                with open(target, 'wb') as f:
                    # iter_content reports transfer problems as requests exceptions
                    for chunk in r.iter_content(chunk_size=65536):
                        f.write(chunk)
        except (requests.exceptions.RequestException, OSError):
            try:
                # a truncated file would be mistaken for a finished download
                os.remove(target)
            except OSError:
                pass  # nothing had been written yet
            return False
        return True

    @classmethod
    def _picture_tag(cls, url):
        '''
        Builds the local file name (name + lower-cased extension) for a
        picture URL; None when the URL carries no usable name or extension.
        '''
        name = cls._NAME_RE.search(url)
        ext = cls._EXT_RE.search(url)
        if name is None or ext is None:
            return None
        return name.group() + ext.group().lower()

    @staticmethod
    def _load_target_dir():
        '''
        Returns the download folder stored in target_dirs.txt, asking for one
        (or creating ./training_images) when the file is missing or invalid.
        '''
        try:
            with open('target_dirs.txt') as f:  # TODO you can add option to change directory here, too. Look up how to have optional arguments
                return json.load(f)
        except (ValueError, FileNotFoundError):
            pass

        answer = input('No target dirctory found. Create One? [y] or [n]:')
        if answer.strip().lower() == 'y':
            target_dir = input('Please provide full URL to destination folder:')  # TODO need to catch human syntax errors here
        else:
            target_dir = os.getcwd()+os.sep+'training_images'
            os.makedirs(target_dir, exist_ok=True)  # folder may be left over from an earlier run
            print('Creating default folder in current directory @ ' + target_dir)

        with open('target_dirs.txt', 'w') as f:
            json.dump(target_dir, f)
        return target_dir

    def dict_pics(self):
        '''
        Builds the {source URL: target file path} dict for every URL in
        temp_pics_source_list.txt that yields a file name. Returns None when
        the URL list cannot be read.
        '''
        target_dir = self._load_target_dir()

        try:
            with open('temp_pics_source_list.txt') as f:
                temp_pics_source_list = json.load(f)
        except (ValueError, FileNotFoundError):
            print('url list not found. aborting')
            return

        dict_pics = {}
        for url in temp_pics_source_list:
            tag = self._picture_tag(url)
            if tag is not None:
                dict_pics[url] = target_dir + os.sep + tag

        # with open('dict_pics.txt', 'w') as f:
        #     json.dump(dict_pics, f)

        return dict_pics  # TODO still need to find sol to outliers (i.e., naming scheme for unusual source URLs)

    def dl_pictures(self, *dict_pics):
        '''
        Downloads pictures from api to local storage using temp_pics_source_list
        and dict_pics

        dict_pics: optional {source URL: target path} dict as the first
        positional argument; read from dict_pics.txt when omitted.
        '''
        if dict_pics and dict_pics[0] is not None:
            # *dict_pics packs the mapping into a tuple; unwrap it
            pics_map = dict_pics[0]
        else:
            with open('dict_pics.txt') as f:
                pics_map = json.load(f)

        try:
            with open('temp_pics_source_list.txt') as f:
                temp_pics_source_list = json.load(f)
        except (ValueError, FileNotFoundError):
            print('url list not found. download aborted')
            return

        with concurrent.futures.ThreadPoolExecutor() as executor:
            outcomes = executor.map(lambda pic: self.dl_pic(pics_map, pic),
                                    temp_pics_source_list)
            for downloaded in outcomes:
                if downloaded is False:
                    print('connection error')
|
|
|
class PreProcessing:
    '''
    Includes methods for pre-processing training set input and labels in the
    training set created from CurateData class. Whereas CurateData training
    sets provided trimmed down data from the raw json response from the
    ShoppingApi call and provided a bare minimum format for the dataframe to be
    used in training, PreProcessing optimizes that dataframe for training and
    includes methods for image manipulation, creating test/train/validation
    splits, etc.
    '''

    @staticmethod
    def _target_path(url, target_dir):
        '''
        Returns target_dir/<name>.jpg for a source image URL, or None when no
        name can be extracted from the URL.
        '''
        match = re.search(r'[^/]+(?=/\$_|.jpg)', url, re.IGNORECASE)
        if match is None:
            return None
        return target_dir + os.sep + match.group() + '.jpg'

    def dict_pics(self):
        '''
        Source to target training. Replaces source image URL with target URL
        determined by values in dict_pics variable.

        Returns a {source URL: target path} dict for the URLs stored in
        temp_pics_source_list.txt; URLs without a recognisable name are left
        out.
        '''
        target_dir = os.getcwd()
        with open('temp_pics_source_list.txt') as f:
            temp_pics_source_list = json.load(f)

        dict_pics = {}
        for url in temp_pics_source_list:
            target = self._target_path(url, target_dir)
            if target is not None:
                dict_pics[url] = target

        # NOTE(review): the message mentions "training_images" while the paths
        # above point directly into the current directory - confirm which one
        # is intended.
        print("{source:target} dictionary created @ " + os.getcwd() + os.sep + 'training_images')
        return dict_pics
|
|
|
|
# TODO pipeline gameplan: 5 files: dict_pics.txt,raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv
|
|
# cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
|
|
# if not exists and append to master img download dict
|
|
# --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted
|
|
|
|
def main():
    '''
    Entry point: meant to create/update a csv file to use for ML training
    from live ebay listings. The pipeline itself is not wired in yet.
    '''
    # main goes here:
    return None


if __name__ == "__main__":
    main()
|
|
|
|
'''
|
|
Based on your sample set of 10 images, if you have an average of 5 images per
|
|
listing and you download a hundred listings, you will have about 102 Gb of
|
|
image data. That's just for one day. If you have more than a million listings
|
|
you're looking at a little over 1Tb of image data. You don't even know if this
|
|
is good data yet.
|
|
'''
|