2021-01-23 06:21:56 +00:00
import importlib
2021-04-21 04:09:35 +00:00
import pdb
2021-04-10 05:37:14 +00:00
import os
2021-01-23 06:21:56 +00:00
import numpy as np
2020-11-09 01:47:03 +00:00
import concurrent . futures
2020-10-12 07:53:29 +00:00
import json
import requests
import pandas as pd
2021-02-01 04:49:24 +00:00
import config as cfg
2021-04-03 06:42:31 +00:00
import shutil
2021-04-04 21:38:04 +00:00
import re
2020-10-12 07:53:29 +00:00
class FindingApi:
    '''
    Methods for accessing eBay's FindingApi services
    '''

    # Finding service operations; __init__ picks one by position.
    SERVICES = (
        'findItemsAdvanced', 'findCompletedItems',
        'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
        'findItemsByProduct',
    )

    def __init__(self, service, pageNumber):
        '''
        service: index into SERVICES selecting the Finding operation to call.
        pageNumber: exclusive upper bound of the result pages to request;
        pages 1 .. pageNumber - 1 are fetched for every category.
        '''
        self.service = self.SERVICES[service]
        # 77 pgs will give equal weights to cats given call constraints
        self.pageNumber = list(range(1, pageNumber))

    # examples of additional params you may want to add:
    # 'itemFilter(0).value':'Used'
    # 'itemFilter(1).name':'ListingType'
    # 'itemFilter(1).value':'AuctionWithBIN'

    def update_cats(self):
        '''
        Fetches the child category ids of the hard-coded parent categories
        from the Shopping API (GetCategoryInfo), saves them to cat_list.txt
        as JSON and returns them as a list.

        A department whose request fails or whose payload lacks the expected
        keys is reported and skipped. cat_list.txt is only rewritten when at
        least one category id was retrieved, so a total failure does not
        replace an existing file with an empty list.
        '''
        parent_cats = ['3034', '93427']
        cat_list = []
        for department in parent_cats:
            params = {
                "callname": "GetCategoryInfo",
                "appid": cfg.sec['SECURITY-APPNAME'],
                "version": "671",
                "responseencoding": "JSON",
                "CategoryID": department,
                "IncludeSelector": "ChildCategories",
            }
            try:
                response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
                response.raise_for_status()
                # The first entry is skipped; presumably it is the queried
                # parent category itself -- TODO confirm against the API.
                categories = response.json()['CategoryArray']['Category'][1:]
            except (requests.exceptions.RequestException, ValueError, KeyError):
                # ValueError covers a body that is not valid JSON.
                print('connection error')
                continue
            cat_list.extend(cat['CategoryID'] for cat in categories)
            # leaf_list = [node['LeafCategory'] for node in response]

        if cat_list:
            with open('cat_list.txt', 'w') as f:
                json.dump(cat_list, f)
        return cat_list

    def get_data(self, category_id, i):
        '''
        Gets raw JSON data from FindingApi service call.
        Currently being used to get itemIDs from categories.

        category_id: eBay category id to search.
        i: result page number to request.

        Returns the decoded JSON response, or an empty dict when the request
        fails or the body is not valid JSON.
        '''
        params = {
            "OPERATION-NAME": self.service,
            "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
            "SERVICE-VERSION": "1.13.0",
            "RESPONSE-DATA-FORMAT": "JSON",
            "categoryId": category_id,
            "paginationInput.entriesPerPage": "100",
            # eBay documents this field as lower camel case ("pageNumber").
            "paginationInput.pageNumber": i,
        }
        try:
            response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                    params=params, timeout=3)
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError):
            print('connection error')  # TODO DECIDE HOW TO HANDLE EXCEPTION
            return {}

    # TODO add some other options to finding call api such as for possibly filtering for used items only.
    # This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
    # try and come up with a way to mathematically determine your odds of maximizing the number of
    # pictures in your training set while reducing the number of useless images. Say for example, if
    # you took a random set of 3 of 8 pictures total from each listing you might have a better chance
    # of getting 3 good pictures in addition to increasing your training set. Or maybe you would have
    # better luck with limiting it to the first 5 pictures instead of random.
    # You may even have more consistency with used shoes since they are "one-off" items without
    # confusing multiple variations and colors. What else you can do is run small training sets on
    # both new and used to see which one is more accurate or if a combo of both is more accurate.

    @staticmethod
    def _known_training_values():
        '''Returns every cell of training.csv as a string, or an empty set if the file is missing/empty.'''
        try:
            training = pd.read_csv('training.csv')
        except (pd.errors.EmptyDataError, FileNotFoundError):
            return set()
        known = set()
        for column in training.columns:
            # Per-column conversion keeps integer ids as '123' rather than '123.0'.
            known.update(training[column].astype(str))
        return known

    @staticmethod
    def _extract_item_ids(data, response_key):
        '''Returns the itemId strings of one Finding response; empty list if it holds no items.'''
        try:
            items = data[response_key][0]['searchResult'][0]['item']
        except (KeyError, IndexError, TypeError):
            # Failed call, error payload, or a page without results.
            return []
        return [item['itemId'][0] for item in items]

    def get_ids_from_cats(self):
        '''
        Creates a 20-itemId list to use for the ShoppingApi call.

        Reads the category ids from cat_list.txt, requests every configured
        page of every category in a thread pool and collects the item ids
        that are neither present in training.csv nor already collected.

        Returns a list of strings, each holding up to 20 comma-separated
        item ids.
        '''
        pages = self.pageNumber

        with open('cat_list.txt') as jf:
            cat_list = json.load(jf)

        # Read once up front instead of once per downloaded page.
        known = self._known_training_values()
        # The Finding API wraps the payload in "<operation name>Response".
        response_key = self.service + 'Response'

        seen = set()
        itemid_results_list = []
        for category_id in cat_list:
            args = [(category_id, i) for i in pages]
            with concurrent.futures.ThreadPoolExecutor() as executor:
                # Results are consumed in this thread, so the bookkeeping
                # below needs no locking.
                for data in executor.map(lambda p: self.get_data(*p), args):
                    for item_id in self._extract_item_ids(data, response_key):
                        if item_id in known or item_id in seen:
                            continue
                        seen.add(item_id)
                        itemid_results_list.append(item_id)

        item_id_results = [
            ','.join(itemid_results_list[n:n + 20])
            for n in range(0, len(itemid_results_list), 20)
        ]
        return item_id_results
2021-01-30 10:08:10 +00:00
# TODO during your try except conditionals just check the csv files. At the end you can create sets. You can creat another condition that says if the final set is smaller than 100k then you can call finding
# service on more pages (but only pages you haven't tried) and repeat the search process.
2021-01-25 05:46:55 +00:00
# TODO instead of running through multiple try except loops try to implement set methods for efficiency and ease. Remember symmetric_difference, difference, intersection, set()
2020-12-25 19:15:20 +00:00
# for category_id in cat_list:
2020-10-18 07:08:04 +00:00
class ShoppingApi:
    '''
    Creates objects from ShoppingApi service calls that can interact with
    pandas dataframes
    '''

    def get_item_from_findItemsByCategory(self, twenty_id):
        '''
        Gets raw JSON data from multiple live listings given multiple itemIds.

        twenty_id: comma-separated item ids (the batches produced by
        FindingApi.get_ids_from_cats hold up to 20 each).

        Returns the value of the response's 'Item' entry, or an empty list
        when the request fails, the body is not valid JSON, or the payload
        has no 'Item' key.
        '''
        params = {
            "callname": "GetMultipleItems",
            "appid": cfg.sec['SECURITY-APPNAME'],
            "version": "671",
            "responseencoding": "JSON",
            "ItemID": twenty_id,
            "IncludeSelector": "ItemSpecifics",
        }
        try:
            response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
            response.raise_for_status()
            return response.json()['Item']
        except (requests.exceptions.RequestException, ValueError, KeyError):  # TODO need better handling
            # Returning an empty batch lets conky() carry on with the
            # remaining batches instead of dying on one bad call.
            print('connection error')
            return []

    def conky(self):
        '''
        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
        data for creating training sets.

        Returns a list of dicts where each dict in the list is a listing.
        '''
        finding = FindingApi(4, 2)  # TODO replace these test values before production
        item_id_results = finding.get_ids_from_cats()

        data = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Each mapped result is the list of listings for one id batch;
            # executor.map yields them in the order of item_id_results.
            for listings in executor.map(self.get_item_from_findItemsByCategory, item_id_results):
                data.extend(listings)

        # TODO save data here. You'll use this with your curate data class. Save this as text file
        return data
2021-04-02 18:08:56 +00:00
2020-12-29 07:20:55 +00:00
# data dictionary can't update from list of dicts unless iterated over. Might need a different way to update.
2021-01-23 06:21:56 +00:00
# TODO It seems like the problem with updating the dictionary/csv file is starting here possibly; I think the item data is getting appended out of order from the item itself.
2021-04-02 18:08:56 +00:00
2020-10-18 00:22:45 +00:00
class CurateData:
    '''
    Contains methods for curating data for machine learning training sets;
    Takes item in data from ShoppingApi request as argument and extracts/creates key
    value pairs that gets updated to custom dataframe used in Ml training sets.
    '''

    def import_raw(self):
        '''
        imports raw response json from local file (raw_data.txt)
        '''
        with open('raw_data.txt') as f:
            raw_data = json.load(f)
        return raw_data

    def raw_df(self, raw_data):
        '''
        creates pandas df from raw json. Intended to be used inline with direct
        data stream from ebay's APIs
        '''
        from io import StringIO

        # read_json is kept for its dtype inference; the text is wrapped in a
        # buffer because passing literal JSON strings is deprecated in
        # recent pandas releases.
        return pd.read_json(StringIO(json.dumps(raw_data)))

    def to_training(self, raw_data):  # NOTE need to create copies not views
        '''
        creates first pass of potential labels for training set. This is the base
        df used to produce other training sets to use.

        Returns a new dataframe holding the ItemID, PictureURL,
        PrimaryCategoryID, PrimaryCategoryName, Title and ItemSpecifics
        columns, with both id columns converted to strings.
        '''
        raw_df = self.raw_df(raw_data)
        label_cols = ['ItemID', 'PictureURL', 'PrimaryCategoryID',
                      'PrimaryCategoryName', 'Title', 'ItemSpecifics']
        # Explicit copy so the assignment below never targets a view of raw_df.
        training = raw_df.loc[:, label_cols].copy()
        id_cols = ['ItemID', 'PrimaryCategoryID']
        training[id_cols] = training[id_cols].astype(str)
        return training  # TODO RENAME THIS FUNC AND RETURN VALUE

    def class_training(self, training):
        '''
        Training set for multiclass portion of training set. Used to train
        separately from multilabel portion
        '''
        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
        return class_training

    def nvl_training(self, training):
        '''
        Training set for multilabel portion.

        Flattens each listing's ItemSpecifics NameValueList into one column
        per specific name and returns those columns next to PictureURL.
        Listings whose ItemSpecifics cell is not a dict (e.g. missing)
        contribute no specifics.
        '''
        def flatten(specifics):
            if not isinstance(specifics, dict):
                return {}
            return {pair['Name']: pair['Value'] for pair in specifics['NameValueList']}

        # Necessary for json_normalize():
        nvl_dicts = [flatten(specifics) for specifics in training['ItemSpecifics']]
        nvl_df = pd.json_normalize(nvl_dicts)
        # Row order matches training; share its index so concat aligns row by
        # row even when training does not carry a default RangeIndex.
        nvl_df.index = training.index
        nvl_training = pd.concat([training['PictureURL'], nvl_df], axis=1)
        return nvl_training

    def extract_df(self, df):
        '''
        converts single-value lists of strings of any df to string if not null
        (lists with several values are joined with a space)
        '''
        def convert(x):
            if isinstance(x, list):
                return ' '.join(x)
            return np.nan if pd.isnull(x) else x

        # DataFrame.applymap was renamed to DataFrame.map in newer pandas.
        elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
        extracted_df = elementwise(convert)
        return extracted_df

    def drop_nvl_cols(self, nvl_training, choice=None):  # NOTE this is wonky
        '''
        Reduces the multilabel training set to the wanted item specifics.

        nvl_training: dataframe produced by nvl_training().
        choice: optional answer to use instead of prompting; any text
        containing 'keep' keeps only the whitelisted columns, anything else
        drops the blacklisted ones. When None the user is prompted.

        Columns named in either list that do not occur in nvl_training are
        ignored.
        '''
        col_drop = [
            'Fabric Type', 'Type of Sport', 'Mid Sole', 'Modified Item',
            'Modification Description', 'Article Type', 'Customized',
            'Character', 'Features', 'Colors', 'Shade', 'Product ID',
            'Personalized', 'Platform Height', 'Year Manufactured',
            'Trim Material', 'Fashion Element', 'Shaft Material',
            'Character Family', 'Heel to Toe Drop', 'Custom Bundle',
            'California Prop 65 Warning', 'Manufacturer Color', 'Main Color',
            'Collection', 'Midsole Type', 'Signed', 'US Shoe Size (Men#!#s)',
            'Calf Circumference', 'Handmade', 'Safety Standards',
            'Customised', 'Cleat Type', 'Cushioning Level', 'AU Shoe Size',
            'Country/Region of Manufacture', 'Type of Sport', 'Main Colour',
            'Look', 'Sole Type', 'Manufacturer Colour', 'Sole Material',
            'Toe Material', 'Feature', 'Length', 'Width', 'Size Chart',
            'Boot Height', 'Water Resistance Level', 'Material Composition',
            'Calf Width', 'Insole Material', 'UPC', 'Size Type',
        ]
        col_keep = [
            'PictureURL', 'Style', 'Department', 'Type', 'Gender', 'Closure', 'Performance/Activity',
            'Accents', 'Occasion', 'Toe Shape', 'Pattern', 'Activity',
            'Heel Style', 'Fastening', 'Heel Type', 'Toe Type', 'Departement',
            'Product Type', 'Sub Style', 'Season', 'Theme', 'Upper Material',
        ]
        # May be no difference between Product type and sub style; fastening and
        # closure; toe shape and toe type; occasion and performance/activity;
        # see if you can combine these somehow (you may not want this though).
        # Also consider keeping only cols that have plenty of values
        # Run some value_count() analysis to determine frequencies and filter
        # user created item specifics, leaving only predefined ebay item specs

        user_input = input('drop or keep cols?: ') if choice is None else choice
        if 'keep' in user_input:
            # Sellers do not fill in every specific, so only select the
            # whitelisted columns that actually exist.
            present = [col for col in col_keep if col in nvl_training.columns]
            dropd = nvl_training.loc[:, present]
        else:
            dropd = nvl_training.drop(columns=col_drop, errors='ignore')
        return dropd

    @staticmethod
    def _explode_pictures(df):
        '''One row per distinct, non-null picture url; index renumbered from 0.'''
        exploded = df.explode('PictureURL').reset_index(drop=True)
        exploded = exploded.dropna(subset=['PictureURL'])
        return exploded.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)

    def expand_nvlclass(self, class_training, dropd):
        '''
        takes image url list from each cell and expands them into separate/duplicate
        instances. Returns expanded copies of both the class training and dropd
        dfs and merges the picture urls found into temp_pics_source_list.txt
        (read later by dl_pictures).
        '''
        # TODO drop duplicates here or before instantiating curate object
        expanded_class = self._explode_pictures(class_training)
        expanded_dropd = self._explode_pictures(dropd)
        expanded_dropd = self.extract_df(expanded_dropd)  # convert lists to values

        # prolly need to create set long before df... immediately after Shopping or trading call
        pic_urls = set(expanded_class.PictureURL.to_list())
        try:
            with open('temp_pics_source_list.txt') as f:
                pic_urls.update(json.load(f))
        except (ValueError, FileNotFoundError):
            # No usable previous list; start from the urls of this run.
            pass

        # Sorted so the saved file is deterministic between runs.
        temp_pics_source_list = sorted(pic_urls)
        with open('temp_pics_source_list.txt', 'w') as f:
            json.dump(temp_pics_source_list, f)

        # TODO still need to save these as csv files
        return expanded_class, expanded_dropd

    def _load_or_create_target_dir(self):
        '''Returns the download folder stored in target_dirs.txt, asking the user for one if none is stored.'''
        try:
            # TODO you can add option to change directory here, too.
            with open('target_dirs.txt') as f:
                return json.load(f)
        except (ValueError, FileNotFoundError):
            pass

        answer = input('No target dirctory found. Create One? [y] or [n]: ')
        if answer.strip().lower() == 'y':
            # TODO need to catch human syntax errors here
            target_dir = input('Please provide full URL to destination folder: ')
        else:
            target_dir = os.getcwd() + os.sep + 'training_images'
            print('Creating default folder in current directory @ ' + target_dir)

        # exist_ok: a folder left over from an earlier run is not an error.
        os.makedirs(target_dir, exist_ok=True)
        with open('target_dirs.txt', 'w') as f:
            json.dump(target_dir, f)
        return target_dir

    def dl_pictures(self, *args):
        '''
        Downloads pictures from api to local storage using temp_pics_source_list
        and creates custom {source:target} dictionary as dict_pics.

        args: optionally, a list of picture urls as first positional
        argument; when omitted the list is read from
        temp_pics_source_list.txt.

        The {source: target} mapping is merged into dict_pics.txt. Pictures
        whose target file already exists are not downloaded again.
        temp_pics_source_list.txt is removed only when every url could be
        named and downloaded. Returns None.
        '''
        target_dir = self._load_or_create_target_dir()

        if args:
            temp_pics_source_list = args[0]
        else:
            try:
                with open('temp_pics_source_list.txt') as f:
                    temp_pics_source_list = json.load(f)
            except (ValueError, FileNotFoundError):
                print('url list not found. download aborted')
                return

        # File name = the url path segment in front of "/$_" or "?jpg".
        name_pattern = re.compile(r'[^/]+(?=/\$_|.jpg)', re.IGNORECASE)
        temp_dict_pics = {}
        unnamed = False
        for url in temp_pics_source_list:
            match = name_pattern.search(url)
            if match is None:
                unnamed = True
                print('could not derive file name, skipping ' + url)
                continue
            temp_dict_pics[url] = target_dir + os.sep + match.group() + '.jpg'

        try:
            with open('dict_pics.txt') as f:
                dict_pics = json.load(f)
        except (ValueError, FileNotFoundError):
            dict_pics = {}
        dict_pics.update(temp_dict_pics)  # TODO This still creates duplicates
        with open('dict_pics.txt', 'w') as f:
            json.dump(dict_pics, f)

        def dl_pic(pic):
            '''Downloads one picture; True if its file exists afterwards, False on a request failure.'''
            target = dict_pics[pic]
            if os.path.exists(target):
                return True
            try:
                # Timeout so one stalled connection cannot hang the pool.
                with requests.get(pic, stream=True, timeout=10) as r:
                    # Without the status check an error page would be
                    # saved as a .jpg.
                    r.raise_for_status()
                    r.raw.decode_content = True
                    with open(target, 'wb') as f:
                        shutil.copyfileobj(r.raw, f)
            except requests.exceptions.RequestException:
                print('connection error')
                return False
            return True

        with concurrent.futures.ThreadPoolExecutor() as executor:
            outcomes = list(executor.map(dl_pic, temp_dict_pics))

        # Deletes file after downloads complete successfully
        if all(outcomes) and not unnamed and os.path.exists('temp_pics_source_list.txt'):
            os.remove('temp_pics_source_list.txt')
class PreProcessing:
    '''
    Pre-processing of the training set input and labels built by the
    CurateData class. CurateData trims the raw json response of the
    ShoppingApi call down to a bare minimum dataframe; PreProcessing is meant
    to optimize that dataframe for training and to hold the methods for image
    manipulation, creating test/train/validation splits, etc.
    '''

    def stt_training(self, dict_pics, expanded_class, expanded_dropd):
        '''
        Source to target training. Intended to replace the source image URL
        with the target URL determined by the values in dict_pics.

        Not implemented yet: the arguments are ignored and None is returned.
        '''
        return None
2021-04-13 17:10:24 +00:00
2021-04-02 18:08:56 +00:00
# TODO pipeline gameplan: 5 files: master img download dict,raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
# cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
# if not exists and append to master img download dict
# --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted
2020-10-18 00:22:45 +00:00
def main():
    '''
    Entry point. Intended to create/update a csv file to use for ML training
    from live ebay listings; currently a placeholder that does nothing.
    '''
    # main goes here:
    return None


if __name__ == "__main__":
    main()
2020-11-09 01:47:03 +00:00
2020-10-12 07:53:29 +00:00
# Limited to 5000 calls to shopping api per day, and the GetMultipleItems service maxes out at 20 items
# per call leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
# More than enough data for your dataset.
2021-01-27 06:01:00 +00:00
# for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
# can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.
2020-11-09 01:47:03 +00:00
# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
# AVOID HICCUPS WHEN CREATING DATASET
2021-04-16 01:26:42 +00:00
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF Shoe TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
2021-02-07 19:25:37 +00:00
# Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)