Added method to update category IDs var and txt file

This commit is contained in:
spbeach46 2021-05-10 15:14:57 -07:00
parent 6cfa798902
commit c8ab1b13d9


@@ -11,7 +11,10 @@ import shutil
import re
class FindingApi:
'''Methods for accessing eBays FindingApi services'''
'''
Methods for accessing eBay's FindingApi services
'''
def __init__(self, service, pageNumber):
self.service = [
'findItemsAdvanced', 'findCompletedItems',
@@ -20,13 +23,45 @@ class FindingApi:
][service]
self.pageNumber = list(range(1, pageNumber)) # 77 pgs will give equal weights to cats given call constraints
# departments = ["3034","93427"] (womens and mens)
# examples of additional params you may want to add:
# 'itemFilter(0).value':'Used'
# 'itemFilter(1).name':'ListingType'
# 'itemFilter(1).value':'AuctionWithBIN'
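# e.g., assuming get_data below builds a params dict for the Finding call, these
# could be folded in like so ('Condition' is an assumed filter name, not in this file):
# params.update({
#     'itemFilter(0).name': 'Condition',
#     'itemFilter(0).value': 'Used',
#     'itemFilter(1).name': 'ListingType',
#     'itemFilter(1).value': 'AuctionWithBIN',
# })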
def update_cats(self):
parent_cats = ['3034', '93427']
cat_list = []
for department in parent_cats:
params = {
"callname":"GetCategoryInfo",
"appid":cfg.sec['SECURITY-APPNAME'],
"version":"671",
"responseencoding":"JSON",
"CategoryID":department,
"IncludeSelector":"ChildCategories",
}
try:
response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
response.raise_for_status()
except requests.exceptions.RequestException:
print('connection error')
continue  # a failed call leaves response stale or undefined, so skip this department
response = response.json()
response = response['CategoryArray']['Category'][1:] # drop the first entry, which is the queried parent category
temp_cat_list = [cat['CategoryID'] for cat in response]
cat_list.extend(temp_cat_list)
with open('cat_list.txt', 'w') as f:
json.dump(cat_list, f)
# leaf_list = [node['LeafCategory'] for node in response]
return cat_list
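# A minimal usage sketch for update_cats (hypothetical driver code; the module
# name and configured cfg credentials are assumptions):
# import json
# from ebay_api import FindingApi  # assumed module name for this file
# finding = FindingApi(4, 2)        # same test values conky() uses below
# cat_list = finding.update_cats()  # one GetCategoryInfo call per parent department
# with open('cat_list.txt') as f:
#     assert json.load(f) == cat_list  # the txt file mirrors the returned list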
def get_data(self, category_id, i):
'''
@@ -127,7 +162,7 @@ class ShoppingApi:
response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
response.raise_for_status()
except requests.exceptions.RequestException:
except requests.exceptions.RequestException: # TODO need better handling
print('connection error')
response = response.json()
@@ -137,8 +172,8 @@
def conky(self):
'''
For some reason item_id_results can only be passed as argument in executor.map
if the variable is made within function
Runs get_item_from_findItemsByCategory in multiple threads to get relevant
data for creating training sets
'''
data = []
finding = FindingApi(4, 2) # TODO replace these test values before production
@@ -157,7 +192,7 @@
class CurateData:
'''
Contains functions for curating data for machine learning training sets;
Contains methods for curating data for machine learning training sets;
Takes an item from the ShoppingApi request data as an argument and extracts/creates
key-value pairs that get added to the custom dataframes used in ML training sets.
'''
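# Illustrative sketch of the name/value extraction described above, using made-up
# item data (the real shape comes from the ShoppingApi JSON response):
# item = {'ItemID': '1', 'ItemSpecifics': {'NameValueList': [
#     {'Name': 'Brand', 'Value': ['Nike']},
#     {'Name': 'Style', 'Value': ['Running']}]}}
# nvl = {pair['Name']: pair['Value'][0]
#        for pair in item['ItemSpecifics']['NameValueList']}
# # nvl == {'Brand': 'Nike', 'Style': 'Running'}; one row of the nvl training set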
@@ -220,7 +255,7 @@ class CurateData:
return extracted_df
def drop_nvl_cols(self, nvl_training):
def drop_nvl_cols(self, nvl_training): # NOTE this is wonky
col_drop = [
'Fabric Type', 'Type of Sport', 'Mid Sole', 'Modified Item',
'Modification Description', 'Article Type', 'Customized',
@@ -281,15 +316,16 @@ class CurateData:
try:
with open('temp_pics_source_list.txt') as f:
temp_pics_source_list = json.load(f)
temp_pics_source_list.append(temp_pics_source_list)
temp_pics_source_list = list(set(temp_pics_source_list))
tpsl = json.load(f)
tpsl.extend(temp_pics_source_list)
temp_pics_source_list = list(set(tpsl))
with open('temp_pics_source_list.txt', 'w') as f:
json.dump(temp_pics_source_list, f)
except (ValueError, FileNotFoundError):
with open('temp_pics_source_list.txt', 'w') as f:
json.dump(temp_pics_source_list, f)
# TODO still need to save these as csv files
return expanded_class, expanded_dropd
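# One possible shape for the csv TODO above (assumes expanded_class and
# expanded_dropd are pandas DataFrames; the file names are placeholders):
# expanded_class.to_csv('expanded_class.csv', index=False)
# expanded_dropd.to_csv('expanded_dropd.csv', index=False)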
def dl_pictures(self, *args):
@@ -301,16 +337,16 @@ class CurateData:
try:
with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
target_dir = json.load(f)
except (ValueError, FileNotFoundError):
target_dir = input('No target directory found. Create One? [y] or [n]:')
target_dir = input('No target directory found. Create One? [y] or [n]:')
if target_dir in ('y', 'Y'):
target_dir = input('Please provide full URL to destination folder:')
with open('target_dirs.txt','w+') as f:
target_dir = input('Please provide full URL to destination folder:') # TODO need to catch human syntax errors here
with open('target_dirs.txt','w') as f:
json.dump(target_dir, f)
else:
target_dir = os.mkdir(os.getcwd()+os.sep+'training_images')
with open('target_dirs.txt','w+') as f:
os.mkdir(os.getcwd()+os.sep+'training_images')
target_dir = os.getcwd()+os.sep+'training_images'
with open('target_dirs.txt','w') as f:
json.dump(target_dir, f)
print('Creating default folder in current directory @ ' + target_dir)
@@ -332,7 +368,7 @@ class CurateData:
try:
with open('dict_pics.txt') as f:
dict_pics = json.load(f)
dict_pics.update(temp_dict_pics)
dict_pics.update(temp_dict_pics) # TODO This still creates duplicates
with open('dict_pics.txt', 'w') as f:
json.dump(dict_pics, f)
@@ -344,7 +380,8 @@ class CurateData:
def dl_pic(dict_pics, pic):
if os.path.exists(dict_pics[pic]): # or call temp_dict_pics[pic] can work
pass
pass # TODO This is not catching duplicates for some reason....possibly not? Upon inspection, files aren't duplicates...but why?
#TODO it would mean that temp_pics_source_list is changing for some reason?
else:
r = requests.get(pic, stream=True)
@@ -352,16 +389,31 @@ class CurateData:
with open(temp_dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
shutil.copyfileobj(r.raw, f)
breakpoint()
bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(lambda p: dl_pic(*p), bargs):
future
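# The lambda above just unpacks each (dict_pics, pic) tuple; an equivalent sketch
# with functools.partial (an alternative to the lambda form, shown for illustration):
# from functools import partial
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     list(executor.map(partial(dl_pic, dict_pics), temp_pics_source_list))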
with open('temp_pics_source_list.txt','w') as f: # Overwrites old when complete
temp_pics_source_list = []
json.dump(temp_pics_source_list, f)
os.remove('temp_pics_source_list.txt') # Deletes file after downloads complete successfully
class PreProcessing:
'''
Includes methods for pre-processing the training set inputs and labels
created by the CurateData class. Whereas CurateData trims the raw json
response from the ShoppingApi call into a bare-minimum dataframe format
for training, PreProcessing optimizes that dataframe for training and
includes methods for image manipulation, creating test/train/validation
splits, etc.
'''
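# e.g., one common sketch for the test/train/validation splits mentioned above
# (assumes scikit-learn is available; training_df is a placeholder name):
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(training_df, test_size=0.2, random_state=0)
# train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=0)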
def stt_training(self, dict_pics, expanded_class, expanded_dropd):
'''
Source to target training. Replaces source image URL with target URL
determined by values in dict_pics variable.
'''
pass
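# A hypothetical sketch of the replacement described in the docstring (assumes
# the expanded dataframes hold their source URLs in a 'PictureURL' column):
# expanded_class['PictureURL'] = expanded_class['PictureURL'].map(dict_pics)
# expanded_dropd['PictureURL'] = expanded_dropd['PictureURL'].map(dict_pics)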
# TODO pipeline gameplan: 5 files: master img download dict, raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
# cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures