import os
from time import sleep
from random import randint
import scrape_ids
from datetime import datetime, timedelta
import dateutil
from dateutil import parser
import pytz
import pdb
from io import StringIO
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
import config as cfg
import shutil
import re
from ebaysdk.exception import ConnectionError
from ebaysdk.trading import Connection as Trading
from ebaysdk.finding import Connection as Finding
from ebaysdk.shopping import Connection as Shopping


class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes
    '''

    def update_cats(self):
        '''
        Updates cat_list.txt
        '''
        parent_cats = ['3034', '93427']  # Women's and Men's shoe departments
        cat_list = []
        for department in parent_cats:

            params = {
                "callname": "GetCategoryInfo",
                "X-EBAY-API-IAF-TOKEN": cfg.sec['X-EBAY-API-IAF-TOKEN'],
                "version": "671",
                "responseencoding": "JSON",
                "CategoryID": department,
                "IncludeSelector": "ChildCategories",
            }

            try:
                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=4)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                print('connection error')

            response = response.json()
            response = response['CategoryArray']['Category'][1:]  # excludes index 0, the parent node, i.e., the women's or men's dept.

            temp_cat_list = [cat['CategoryID'] for cat in response]
            cat_list.extend(temp_cat_list)

        with open('cat_list.txt', 'w') as f:
            json.dump(cat_list, f)

        # leaf_list = [node['LeafCategory'] for node in response]

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings, given a comma-separated
        string of up to 20 ItemIDs (twenty_id)
        '''
        with open('ids.txt') as f:
            ids = json.load(f)
            item_id_results = [','.join(ids[n:n+20]) for n in range(0, len(ids), 20)]  # chunks of 20 ItemIDs per call to maximize the dataset and minimize calls given API constraints

        headers = {
            "X-EBAY-API-IAF-TOKEN": cfg.sec['X-EBAY-API-IAF-TOKEN'],  # TODO implement auto oauth token renewal
            "version": "671",
        }

        url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID=" + twenty_id

        try:
            sleep(randint(1, 10))  # random pause between calls; may not be necessary
            response = requests.get(url, headers=headers, timeout=24)
            response.raise_for_status()
            response = response.json()
            response = response['Item']
            print('index number {}'.format(item_id_results.index(twenty_id)))
            print(response)

        except (requests.exceptions.RequestException, KeyError):
            print('connection error. IP limit possibly exceeded')
            print('index number {}'.format(item_id_results.index(twenty_id)))
            return  # returns None; handled in conky()

        return response

    def conky(self):
        '''
        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
        data for creating training sets
        '''
        try:
            with open('raw_data.txt') as f:
                data = json.load(f)
        except (FileNotFoundError, ValueError):
            data = []

        try:
            with open('ids.txt') as f:
                ids = json.load(f)
                item_id_results = [','.join(ids[n:n+20]) for n in range(0, len(ids), 20)]  # chunks of 20 ItemIDs per call to maximize the dataset and minimize calls given API constraints
        except (FileNotFoundError, ValueError):
            item_id_results = scrape_ids.main()

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
                if future is not None:
                    for item in future:
                        data.append(item)  # the end result is a list of dicts where each dict is a listing
                else:
                    print('reached call limit')
                    break

        with open('raw_data.txt', 'w') as f:
            json.dump(data, f)

    # NOTE:
    # Limited to 5,000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
    # per call, leaving 100,000 items per day for your pandas dataframe initially. These have to be divided
    # up among the categories, which leaves roughly 6.25K results per category. More than enough data for the dataset.
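
    # Back-of-envelope for the note above (a sketch of the arithmetic only; the ~16 leaf
    # categories is an assumed count implied by the numbers, nothing here is enforced in code):
    #   5,000 calls/day * 20 items/call = 100,000 items/day
    #   100,000 items/day / ~16 leaf categories ≈ 6,250 (6.25K) results per category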


class CurateData:
    '''
    Contains methods for curating data for machine learning training sets;
    takes the items in the data from the ShoppingApi request as an argument and extracts/creates
    key-value pairs that get added to the custom dataframes used in ML training sets.
    '''

    def import_raw(self):
        '''
        Imports raw response JSON from the local file. This is data from the
        GetMultipleItems call in ShoppingApi
        '''
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
        return raw_data

    def raw_df(self, raw_data):  # TODO not dropping dupes, and is appending raw_data for some reason
        '''
        Creates a pandas df from the raw JSON and saves the master raw csv file as raw_df.csv.
        Intended to be used inline with the direct data stream from eBay's APIs
        '''
        to_json = json.dumps(raw_data)
        raw_df = pd.read_json(StringIO(to_json))
        raw_df.to_csv('raw_df.csv')  # NOTE not append mode because raw_df is made from the master raw_data.txt file
        # raw_df = pd.read_csv('raw_df.csv', index_col=0)
        # raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)  # may not need this
        # raw_df.to_csv('raw_df.csv')
        # TODO still saving "Unnamed: 0" column

        return raw_df

    def to_training(self, raw_data):
        '''
        Creates the first pass of potential labels for the training set. This is the base
        df used to produce the other training sets.
        '''
        raw_df = self.raw_df(raw_data)
        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
        training = interm_df1.dropna(subset=['ItemSpecifics'])

        return training  # TODO RENAME THIS FUNC AND its RETURN VALUE

    def class_training(self, training):
        '''
        Training set for the multiclass portion of the training set. Used to train
        separately from the multilabel portion
        '''
        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]

        return class_training

    def nvl_training(self, training):
        '''
        Training set for the multilabel portion
        '''
        interm_df1 = pd.Series(training.ItemSpecifics)
        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])

        # Necessary for json_normalize():
        nvl_dict = interm_df1.apply(lambda x: {k: v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])})
        nvl_df = pd.json_normalize(nvl_dict)
        nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)

        return nvl_training

    def extract_df(self, df):
        '''
        Converts single-value lists of strings in any df cell to a string if not null
        '''
        extracted_df = df.applymap(lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isnull(x) else x)

        return extracted_df

    def drop_nvl_cols(self, nvl_training):

        with open('cat_spacs.txt') as f:
            cat_spacs = json.load(f)

        drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line',
                'Personalize', 'Fabric Type', 'Customized', 'Release Year',
                'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade',
                'Signed', 'Silhouette', 'Insole Material', 'Lining Material',
                'California Prop 65 Warning', 'Character Family', 'Character',
                'Cushioning Level', 'Personalization Instructions', 'Pronation',
                ]

        drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features',
                  'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size',
                  'Country/Region of Manufacture', 'Brand', 'Model']

        for cat in drop:
            if cat in cat_spacs:
                cat_spacs.remove(cat)
        for cat in drop_2:
            if cat in cat_spacs:
                cat_spacs.remove(cat)

        user_input = input('drop cols? (y,n; default=y): ')
        if 'n' in user_input:
            dropd = nvl_training  # .drop(col_drop, errors='ignore', axis=1)  # errors='ignore' for nonexistent labels
        else:
            cols = []
            for col in cat_spacs:
                if col in list(nvl_training.columns):
                    cols.append(col)
            cols.insert(0, 'PictureURL')  # list of other cols that aren't needed for training
            dropd = nvl_training[cols]

        return dropd

    # For future reference: to deal with inconsistent values in the nvl (caused by sellers entering custom values
    # in the fields), you can drop either the listings or the k/v pairs that are unique. Apply a function to find
    # the frequency of each k/v pair, build the list of unique k/v pairs, and drop those whose frequency is 1
    # (see the sketch below).
    # Check the above list of cols to keep for duplicates with different spelling or phrasing
    # (e.g., Departement and Department, or Fastening and Closure Type)
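
    # A minimal sketch of the frequency-based pruning described above. This helper is hypothetical
    # (nothing calls it yet) and assumes extract_df() has already been applied, so each cell holds a
    # plain string rather than a list; min_count=2 simply drops values that occur only once.
    def prune_unique_nvl(self, nvl_training, min_count=2):
        pruned = nvl_training.copy()
        for col in pruned.columns.drop('PictureURL', errors='ignore'):
            counts = pruned[col].value_counts()           # frequency of each value in this column
            rare = counts[counts < min_count].index       # one-off custom seller entries
            pruned[col] = pruned[col].where(~pruned[col].isin(rare))  # rare values become NaN
        return pruned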

    def expand_nvlclass(self, class_training, dropd):
        '''
        Takes the image URL list from each cell and expands the URLs into separate/duplicate
        instances. Modifies both the class_training and dropd dfs. Appends to the custom
        image URL dict {'source': 'target'}.
        *consider applying this function to other cells that have multiple values in their lists
        '''
        expand = input("expand image list or use primary listing image? (y or n): ")
        if 'y' in expand.lower():
            expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
            expanded_class = expanded_class.dropna(subset=['PictureURL'])
            expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

            expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
            expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
            expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

            expanded_dropd = self.extract_df(expanded_dropd)  # convert lists to values

            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))
            # TODO because the var is deleted after dl_pictures you may be getting duplicate pictures;
            # i.e., expanded_class.PictureURL is a master series and temp_pics_source_list gets written as such,
            # giving many repeated PictureURLs (they won't be re-downloaded due to the check in dl_pic, but the
            # checking keeps growing in compute cost). So, figure out a way to make a true temp list based on
            # the current call being executed.

        else:
            class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0])
            expanded_class = class_training
            dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0])
            expanded_dropd = dropd

            expanded_dropd = self.extract_df(expanded_dropd)  # convert lists to values

            temp_pics_source_list = list(set(expanded_class.PictureURL.to_list()))

        try:
            with open('temp_pics_source_list.txt') as f:
                tpsl = json.load(f)
                tpsl.extend(temp_pics_source_list)
                temp_pics_source_list = list(set(tpsl))
            with open('temp_pics_source_list.txt', 'w') as f:
                json.dump(temp_pics_source_list, f)
        except (ValueError, FileNotFoundError):
            with open('temp_pics_source_list.txt', 'w') as f:
                json.dump(temp_pics_source_list, f)

        # Append to master training dataframes, drop potential dupes and save
        expanded_class.to_csv('expanded_class.csv')
        # expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
        # expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        # expanded_class.to_csv('expanded_class.csv', mode='a', encoding='utf-8')  # TODO see line 235 about views and copies

        expanded_dropd.to_csv('expanded_dropd.csv')
        # expanded_dropd = pd.read_csv('expanded_dropd.csv', index_col=0)
        # expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        # expanded_dropd.to_csv('expanded_dropd.csv', mode='a', encoding='utf-8')

        return expanded_class, expanded_dropd

    def dl_pictures(self, *args):
        '''
        Downloads pictures from the API to local storage using temp_pics_source_list
        and creates the custom {source: target} dictionary, dict_pics
        '''
        # TODO add option to include only the first image of each listing as the
        # others may be crappy for training. Also consider adding an option to
        # reduce the size of each pic downloaded

        try:
            with open('target_dirs.txt', 'r+') as f:  # TODO you can add an option to change the directory here, too. Look up how to have optional arguments
                target_dir = json.load(f)
        except (ValueError, FileNotFoundError):
            target_dir = input('No target directory found. Create one? [y] or [n]: ')
            if target_dir.lower() == 'y':
                target_dir = input('Please provide the full path to the destination folder: ')  # TODO need to catch human syntax errors here
                with open('target_dirs.txt', 'w') as f:
                    json.dump(target_dir, f)
            else:
                os.mkdir(os.getcwd() + os.sep + 'training_images')
                target_dir = os.getcwd() + os.sep + 'training_images'
                with open('target_dirs.txt', 'w') as f:
                    json.dump(target_dir, f)
                print('Creating default folder in current directory @ ' + target_dir)

        with open('temp_pics_source_list.txt') as f:
            try:
                if args:
                    temp_pics_source_list = args[0]
                else:
                    temp_pics_source_list = json.load(f)
            except (ValueError, FileNotFoundError):
                if args:
                    temp_pics_source_list = args[0]
                else:
                    print('url list not found. download aborted')
                    return

        temp_dict_pics = {k: target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}

        try:
            with open('dict_pics.txt') as f:
                dict_pics = json.load(f)
                dict_pics.update(temp_dict_pics)  # TODO This still creates duplicates
            with open('dict_pics.txt', 'w') as f:
                json.dump(dict_pics, f)
        except (ValueError, FileNotFoundError):
            with open('dict_pics.txt', 'w') as f:
                json.dump(temp_dict_pics, f)
                dict_pics = temp_dict_pics

        def dl_pic(dict_pics, pic):

            if os.path.exists(dict_pics[pic]):  # or calling temp_dict_pics[pic] can work
                pass  # TODO This is not catching duplicates for some reason....possibly not? Upon inspection, files aren't duplicates...but why?
                # TODO it would mean that temp_pics_source_list is changing for some reason?
            else:
                try:
                    r = requests.get(pic, stream=True)
                    r.raw.decode_content = True
                    with open(temp_dict_pics[pic], 'wb') as f:  # or calling dict_pics[pic] can work
                        shutil.copyfileobj(r.raw, f)
                except ConnectionError:
                    return

        bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(lambda p: dl_pic(*p), bargs):
                if future is not None:
                    future
                else:
                    print('connection error')

        os.remove('temp_pics_source_list.txt')  # deletes the file after downloads complete successfully


class PreProcessing:
    '''
    Includes methods for pre-processing the training set inputs and labels
    created by the CurateData class. Whereas the CurateData training sets
    provide trimmed-down data from the raw JSON response of the ShoppingApi
    call and only a bare-minimum dataframe format for training, PreProcessing
    optimizes that dataframe for training and includes methods for image
    manipulation, creating test/train/validation splits, etc.
    '''
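
    # One possible shape for the split step mentioned above (a sketch only; the method name,
    # fractions, and the shuffle via pandas.DataFrame.sample are assumptions, and nothing calls it yet):
    #
    #   def train_val_test_split(self, df, val_frac=0.15, test_frac=0.15):
    #       shuffled = df.sample(frac=1, random_state=0).reset_index(drop=True)
    #       n_val = int(len(shuffled) * val_frac)
    #       n_test = int(len(shuffled) * test_frac)
    #       val = shuffled[:n_val]
    #       test = shuffled[n_val:n_val + n_test]
    #       train = shuffled[n_val + n_test:]
    #       return train, val, test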

    def stt_training(self, dict_pics, expanded_class, expanded_dropd):
        '''
        Source-to-target training. Replaces each source image URL with the target path
        determined by the values in the dict_pics variable.
        '''
        pass
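
        # A minimal sketch of what this replacement could look like (hypothetical; it assumes
        # dict_pics maps each source URL to its local target path, as built in dl_pictures):
        #
        #   expanded_class['PictureURL'] = expanded_class['PictureURL'].map(dict_pics)
        #   expanded_dropd['PictureURL'] = expanded_dropd['PictureURL'].map(dict_pics)
        #   return expanded_class, expanded_dropd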

    # TODO pipeline gameplan: 5 files: dict_pics.txt, raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv
    # cont... open raw_json.txt and append, same with the csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
    # if not exists, and append to the master img download dict
    # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted


def main():
    '''
    Main program creates/updates a csv file to use for ML training from live
    eBay listings
    '''
    pass
    # main goes here:
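
    # A rough sketch of how the classes above could be wired together here (the call order and
    # arguments are assumptions based on the method signatures in this module, not a tested flow):
    #
    #   shopping = ShoppingApi()
    #   shopping.conky()                       # refresh raw_data.txt via threaded GetMultipleItems calls
    #   curate = CurateData()
    #   raw_data = curate.import_raw()
    #   training = curate.to_training(raw_data)
    #   class_training = curate.class_training(training)
    #   nvl_training = curate.nvl_training(training)
    #   dropd = curate.drop_nvl_cols(nvl_training)
    #   expanded_class, expanded_dropd = curate.expand_nvlclass(class_training, dropd)
    #   curate.dl_pictures()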


if __name__ == "__main__":
    main()

'''
Based on your sample set of 10 images, if you have an average of 5 images per
listing and you download a hundred thousand listings, you will have about 102 GB of
image data. That's just for one day. If you have more than a million listings,
you're looking at a little over 1 TB of image data. You don't even know if this
is good data yet.
'''
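
# Back-of-envelope behind the figures above (a sketch; the ~200 KB average image size is
# inferred from those figures, not measured here):
#   100,000 listings/day * 5 images * ~0.2 MB ≈ 100 GB of images per day
#   1,000,000 listings   * 5 images * ~0.2 MB ≈ 1 TB of images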