From 597372cad3543dec59517bd32412d652ccbeea12 Mon Sep 17 00:00:00 2001
From: spbeach46
Date: Thu, 24 Feb 2022 21:21:22 -0700
Subject: [PATCH] needless files removed

---
 :w | 595 -------------------------------------------------------------
 1 file changed, 595 deletions(-)
 delete mode 100644 :w

diff --git a/:w b/:w
deleted file mode 100644
index efc95f6..0000000
--- a/:w
+++ /dev/null
@@ -1,595 +0,0 @@
-import os
-from time import sleep
-from random import randint
-import scrape_ids
-from datetime import datetime, timedelta
-import dateutil
-from dateutil import parser
-import pytz
-import pdb
-from io import StringIO
-import numpy as np
-import concurrent.futures
-import json
-import requests
-import pandas as pd
-import config as cfg
-import shutil
-import re
-import urllib, base64
-
-from ebaysdk.exception import ConnectionError
-from ebaysdk.trading import Connection as Trading
-from ebaysdk.finding import Connection as Finding
-from ebaysdk.shopping import Connection as Shopping
-
-# renew oauth token for shopping api
-def getAuthToken():
-    AppSettings = {
-        'client_id': cfg.oauth["client_id"],
-        'client_secret': cfg.oauth["client_secret"],
-        'ruName': cfg.oauth["RuName"]
-    }
-
-    authHeaderData = AppSettings['client_id'] + ':' + AppSettings['client_secret']
-    encodedAuthHeader = base64.b64encode(str.encode(authHeaderData))
-    encodedAuthHeader = str(encodedAuthHeader)[2:len(str(encodedAuthHeader)) - 1]  # strip the b'...' wrapper from the bytes repr
-
-    headers = {
-        "Content-Type": "application/x-www-form-urlencoded",  # the token endpoint expects a form-encoded body
-        "Authorization": "Basic " + str(encodedAuthHeader)
-    }
-
-    body = {
-        "grant_type": "client_credentials",
-        "redirect_uri": AppSettings['ruName'],
-        "scope": "https://api.ebay.com/oauth/api_scope"
-    }
-
-    data = urllib.parse.urlencode(body)
-
-    tokenURL = "https://api.ebay.com/identity/v1/oauth2/token"
-
-    response = requests.post(tokenURL, headers=headers, data=data).json()
-#    error = response['error_description']  # only present if the request failed
-    access_token = response['access_token']
-
-    with open('temp_oauth_token.txt', 'w') as f:
-        json.dump(access_token, f)
-
-    return access_token
-
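A shorter equivalent of the Basic-auth handling in getAuthToken above is sketched here for reference (not part of the original file; it assumes the same cfg.oauth keys): requests can build the Basic header itself, and it form-encodes a dict passed to data, so the manual base64 and urlencode steps can be dropped.

# Sketch only: same client-credentials grant, letting requests handle the Basic header.
def get_auth_token_alt():
    response = requests.post(
        "https://api.ebay.com/identity/v1/oauth2/token",
        auth=(cfg.oauth["client_id"], cfg.oauth["client_secret"]),  # requests does the base64 encoding
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data={"grant_type": "client_credentials",
              "redirect_uri": cfg.oauth["RuName"],
              "scope": "https://api.ebay.com/oauth/api_scope"},
    )
    response.raise_for_status()
    return response.json()["access_token"]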
-
-class FindingApi:
-    '''
-    Methods for accessing eBay's FindingApi services
-    '''
-
-    def __init__(self, service):
-        self.service = [
-            'findItemsAdvanced', 'findCompletedItems',
-            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
-            'findItemsByProduct'
-        ][service]  # Currently using only index 4, i.e., service = 4
-
-    def get_data(self, category_id):
-        '''
-        Gets raw JSON data from a FindingApi service call. Currently being used to
-        get itemIds from categories.
-        '''
-#        startTime = dateutil.parser.isoparse(startTime)
-#        now = datetime.datetime.now(tz=pytz.UTC)
-#        days_on_site = (now - startTime).days  # as int
-
-        ids = []
-        params = {
-            "OPERATION-NAME": self.service,
-            "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
-            "SERVICE-VERSION": "1.13.0",
-            "RESPONSE-DATA-FORMAT": "JSON",
-            "categoryId": category_id,
-            "paginationInput.entriesPerPage": "100",
-            "paginationInput.PageNumber": "1",
-            "itemFilter(0).name": "Condition",
-            "itemFilter(0).value": "Used",
-            "itemFilter.name": "HideDuplicateItems",
-            "itemFilter.value": "true",
-            "sortOrder": "StartTimeNewest",
-        }
-#            "itemFilter(1).name": "TopRatedSellerOnly",  # TODO fix here
-#            "itemFilter(1).value": "true"
-
-        try:
-            response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
-                                    params=params, timeout=24)
-            response.raise_for_status()
-
-        except requests.exceptions.RequestException:  # appears to work; TODO: allow a run to continue where it left off, or use a better timeout
-            print('connection error')
-            return ids
-
-        try:
-            data = response.json()
-            for item in data['findItemsByCategoryResponse'][0]['searchResult'][0]['item']:
-                ids.append(item['itemId'][0])
-
-            ids = list(set(ids))
-
-        except (AttributeError, KeyError):
-            print('AttributeError or KeyError. Exiting')
-            print(response.json())
-            return ids
-
-        return ids
-
-# TODO add other options to the Finding API call, such as filtering for used items only. That might give a better
-# dataset for training, or perhaps a mixture of new and used. Try to work out mathematically how to maximize the
-# number of useful pictures in the training set while reducing the number of useless images. For example, taking a
-# random 3 of the 8 pictures in each listing might give a better chance of getting 3 good pictures while also
-# growing the training set; limiting it to the first 5 pictures instead of a random sample may work better still.
-
-# Used shoes may also be more consistent, since they are "one-off" items without confusing multiple variations and
-# colors. Another option is to run small training sets on both new and used to see which is more accurate, or
-# whether a combination of both is more accurate.
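The get_data call above only ever requests page 1 (paginationInput.PageNumber is hard-coded to "1"), so each category is capped at 100 itemIds. A rough sketch of paging through more results follows; it is not part of the original file, reuses the params dict and JSON paths exactly as written above, and assumes the service accepts the same pagination field for later pages.

# Sketch only: collect ids from the first few result pages instead of page 1 only.
def get_all_pages(params, max_pages=5):
    ids = []
    for page in range(1, max_pages + 1):
        params["paginationInput.PageNumber"] = str(page)
        r = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                         params=params, timeout=24)
        r.raise_for_status()
        result = r.json()['findItemsByCategoryResponse'][0]['searchResult'][0]
        ids.extend(item['itemId'][0] for item in result.get('item', []))
    return list(set(ids))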
-
-    def get_ids_from_cats(self):
-        '''
-        Creates a list of 20-itemId strings to use for the ShoppingApi call
-        '''
-
-        ids = []
-
-        # load category id list
-        with open('cat_list.txt') as jf:
-            cat_list = json.load(jf)
-
-        # load list of master ids
-        with open('master_ids.txt') as f:
-            master_ids = json.load(f)
-
-        # fetch ids with calls to the Finding Api, given cats as params
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for future in executor.map(self.get_data, cat_list):
-                ids.extend(future)
-
-        # extend the master id list with the ids from this call and save
-        master_ids.extend(ids)
-        master_ids = list(set(master_ids))
-        with open('master_ids.txt', 'w') as f:
-            json.dump(master_ids, f)
-
-        # 20-itemId strings created to maximize the dataset per call given the call constraints
-        twenty_id_list = [','.join(ids[n:n+20]) for n in list(range(0, len(ids), 20))]
-
-        return twenty_id_list, ids
-
-class ShoppingApi:
-    '''
-    Creates objects from ShoppingApi service calls that can interact with
-    pandas dataframes
-    '''
-
-#    def __init__(self):
-#
-#        # renew oauth token
-#        oauth_response = getAuthToken()
-#        access_token = oauth_response[0]
-#
-#        self.access_token = access_token
-
-    def update_cats(self):
-        '''
-        Updates cat_list.txt
-        '''
-
-        parent_cats = ['3034', '93427']  # Women's and Men's shoe departments
-        cat_list = []
-        # TODO make separate lists for women's and men's shoe cats. Needed to train
-        # men's and women's cats separately. This might improve val. acc. during training
-
-        with open('temp_oauth_token.txt') as f:
-            access_token = json.load(f)
-        for department in parent_cats:
-
-            headers = {
-                "X-EBAY-API-IAF-TOKEN": access_token,
-                "version": "671",
-            }
-
-            url = "https://open.api.ebay.com/shopping?&callname=GetCategoryInfo&responseencoding=JSON&IncludeSelector=ChildCategories&CategoryID=" + department
-
-            try:
-                response = requests.get(url, headers=headers, timeout=4)
-                response.raise_for_status()
-
-            except requests.exceptions.RequestException:
-                print('connection error')
-                continue  # skip this department rather than parsing a failed response
-
-            response = response.json()
-            parent_cat = response['CategoryArray']['Category'][0]['CategoryID']
-            response = response['CategoryArray']['Category'][1:]  # excludes index 0, which is the parent node, i.e., women's or men's dept.
-
-            temp_cat_list = [cat['CategoryID'] for cat in response]
-            if parent_cat == '3034':
-                women_cats = temp_cat_list
-            elif parent_cat == '93427':
-                men_cats = temp_cat_list
-            cat_list.extend(temp_cat_list)
-
-        with open('cat_list.txt', 'w') as f:
-            json.dump(cat_list, f)
-        with open('women_cat_list.txt', 'w') as f:
-            json.dump(women_cats, f)
-        with open('men_cat_list.txt', 'w') as f:
-            json.dump(men_cats, f)
-
-    def get_item_from_findItemsByCategory(self, twenty_id):
-        '''
-        Gets raw JSON data from multiple live listings given multiple itemIds
-        '''
-        with open('temp_oauth_token.txt') as f:
-            access_token = json.load(f)
-
-        headers = {
-            "X-EBAY-API-IAF-TOKEN": access_token,
-            "version": "671",
-        }
-
-        url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID=" + twenty_id
-
-        try:
-            response = requests.get(url, headers=headers, timeout=24)
-            response.raise_for_status()
-            response = response.json()
-            item = response['Item']
-
-        except (requests.exceptions.RequestException, KeyError):
-            print('connection error. IP limit possibly exceeded')
-            print(response)
-            return  # returns NoneType. Handled at conky()
-
-        return item
-
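Since the except branch above notes that the IP limit may be exceeded, one possible hardening is to retry the call a few times with a pause. This is a sketch only, not part of the original file; it assumes url and headers are built exactly as in get_item_from_findItemsByCategory, and it uses the sleep already imported from time.

# Sketch only: retry the GetMultipleItems request with a pause between attempts.
def get_with_retries(url, headers, attempts=3, pause=30):
    for attempt in range(attempts):
        try:
            response = requests.get(url, headers=headers, timeout=24)
            response.raise_for_status()
            return response.json()['Item']
        except (requests.exceptions.RequestException, KeyError):
            print('attempt ' + str(attempt + 1) + ' failed; sleeping ' + str(pause) + 's')
            sleep(pause)
    return None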
-    def conky(self, twenty_ids_list):
-        '''
-        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
-        data for creating training sets
-        '''
-        try:
-            with open('raw_data.txt') as f:
-                data = json.load(f)
-        except (FileNotFoundError, ValueError):
-            data = []
-
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for future in executor.map(self.get_item_from_findItemsByCategory, twenty_ids_list):
-                if future is not None:
-                    for item in future:
-                        data.append(item)  # The end result should be a list of dicts where each dict in the list is a listing
-                else:
-                    print('response is None')
-                    break
-        with open('raw_data.txt', 'w') as f:
-            json.dump(data, f)
-        return data
-
-# NOTE:
-
-# Limited to 5,000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
-# per call, leaving 100,000 items per day for the pandas dataframe initially. These will have to be divided
-# up among the categories, which leaves about 6.25K results per cat. More than enough data for the dataset.
-
-
-class CurateData:
-    '''
-    Contains methods for curating data for machine learning training sets;
-    takes item data from the ShoppingApi request as an argument and extracts/creates key-value
-    pairs that get updated to the custom dataframe used in ML training sets.
-    '''
-
-    def import_raw(self):
-        '''
-        Imports raw response json from a local file. This is data from the
-        GetMultipleItems call in ShoppingApi
-        '''
-        with open('raw_data.txt') as f:
-            raw_data = json.load(f)
-        return raw_data
-
-    def raw_df(self, raw_data):  # TODO not dropping dupes, and is appending raw_data for some reason
-        '''
-        Creates a pandas df from raw json and saves the master raw csv file as raw_df.csv.
-        Intended to be used inline with the direct
-        data stream from eBay's APIs
-        '''
-        to_json = json.dumps(raw_data)
-        raw_df = pd.read_json(StringIO(to_json))
-        raw_df.to_csv('raw_df.csv')  # NOTE not append mode because raw_df is made from the master raw_data.txt file
-        #raw_df = pd.read_csv('raw_df.csv', index_col=0)
-        #raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)  # may not need this
-        #raw_df.to_csv('raw_df.csv')
-
-        # TODO still saving the "Unnamed: 0" column
-
-        return raw_df
-
-    def to_training(self, raw_data):
-        '''
-        Creates the first pass of potential labels for the training set. This is the base
-        df used to produce the other training sets.
-        '''
-        raw_df = self.raw_df(raw_data)
-        interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
-        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1.loc[:, ['ItemID', 'PrimaryCategoryID']].astype(str)
-        training = interm_df1.dropna(subset=['ItemSpecifics'])
-        return training  # TODO RENAME THIS FUNC AND ITS RETURN VALUE
-
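The two TODOs in raw_df above (duplicates are not dropped, and a stray "Unnamed: 0" column is saved) could be handled along these lines. Sketch only, not part of the original file; it assumes ItemID is a scalar column in the raw response and would replace the body of raw_df.

# Sketch: de-duplicate on ItemID and skip writing the pandas index, which is what
# later reappears as the "Unnamed: 0" column when the csv is re-read.
raw_df = pd.read_json(StringIO(json.dumps(raw_data)))
raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)
raw_df.to_csv('raw_df.csv', index=False)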
-    def class_training(self, training):
-        '''
-        Training set for the multiclass portion of the training set. Used to train
-        separately from the multilabel portion
-        '''
-        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
-        return class_training
-
-    def nvl_training(self, training):
-        '''
-        Training set for the multilabel portion
-        '''
-        interm_df1 = pd.Series(training.ItemSpecifics)
-        interm_df1 = interm_df1.apply(lambda x: x['NameValueList'])
-
-        # Necessary for json_normalize():
-
-        nvl_dict = interm_df1.apply(lambda x: {k: v for (k, v) in zip([n['Name'] for n in x], [v['Value'] for v in x])})
-        nvl_df = pd.json_normalize(nvl_dict)
-        nvl_training = pd.concat([pd.Series(training.PictureURL), nvl_df], axis=1)
-
-        return nvl_training
-
-    def extract_df(self, df):
-        '''
-        Converts single-value lists of strings in any df to strings if not null
-        '''
-        extracted_df = df.applymap(lambda x: ' '.join(x) if isinstance(x, list) else np.nan if pd.isnull(x) else x)
-
-        return extracted_df
-
-    def drop_nvl_cols(self, nvl_training):
-
-        with open('cat_spacs.txt') as f:
-            cat_spacs = json.load(f)
-
-        drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line',
-                'Personalize', 'Fabric Type', 'Customized', 'Release Year',
-                'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade',
-                'Signed', 'Silhouette', 'Insole Material', 'Lining Material',
-                'California Prop 65 Warning', 'Character Family', 'Character',
-                'Cushioning Level', 'Personalization Instructions', 'Pronation',
-                ]
-        drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features',
-                  'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size',
-                  'Country/Region of Manufacture', 'Brand', 'Model']
-        for cat in drop:
-            if cat in cat_spacs:
-                cat_spacs.remove(cat)
-        for cat in drop_2:
-            if cat in cat_spacs:
-                cat_spacs.remove(cat)
-
-        user_input = input('drop cols? (y,n; default=y): ')
-
-        if 'n' in user_input:
-            dropd = nvl_training  # .drop(col_drop, errors='ignore', axis=1)  # errors='ignore' for non-existent labels
-        else:
-            cols = []
-            for col in cat_spacs:
-                if col in list(nvl_training.columns):
-                    cols.append(col)
-            cols.insert(0, 'PictureURL')  # keep PictureURL ahead of the label cols retained for training
-            dropd = nvl_training[cols]
-
-        return dropd
-
-# For future reference: to deal with inconsistent values in the nvl (due to sellers entering custom values in the
-# fields) you can drop either the listings or the k/v pairs that are unique. These can be found by computing the
-# frequency of each k/v pair, listing the unique pairs, and dropping those with a frequency of 1 (see the sketch
-# below).
-
-# Check the above list of cols to keep for duplicates with different spelling or phrasing (e.g., Departement and
-# Department, or Fastening and Closure Type)
-
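A rough sketch of the frequency idea described in the comment above (not part of the original file). It assumes dropd is the frame returned by drop_nvl_cols and that every column other than PictureURL is a label column.

# Sketch: blank out one-off seller-typed values so they don't become spurious labels.
specs = dropd.drop(columns=['PictureURL'])
counts = specs.apply(lambda col: col.map(col.value_counts()))  # per-cell frequency of that cell's value
specs = specs.mask(counts == 1)                                # values seen only once -> NaN
dropd = pd.concat([dropd[['PictureURL']], specs], axis=1)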
(y or n): ") - if ('y' or 'Y') in expand: - expanded_class = class_training.explode('PictureURL').reset_index(drop=True) - expanded_class = expanded_class.dropna(subset=['PictureURL']) - expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True) - - expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) - expanded_dropd = expanded_dropd.dropna(subset=['PictureURL']) - expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True) - - expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values - - temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) - - else: - class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan) - expanded_class = class_training.dropna() - dropd = dropd.dropna(subset=['PictureURL']) - dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan) - dropd = dropd.dropna(subset=['PictureURL']) - expanded_dropd = dropd - - expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values - - # retrieves picture URLs from master raw_data.txt and rewrites temp_pics_source_list.txt - temp_pics_source_list = list(set(expanded_class.PictureURL.to_list())) - - try: - with open('temp_pics_source_list.txt') as f: - tpsl = json.load(f) - tpsl.extend(temp_pics_source_list) - - # ensures no duplicate source URLs exist - temp_pics_source_list = list(set(tpsl)) - with open('temp_pics_source_list.txt', 'w') as f: - json.dump(temp_pics_source_list, f) - - # creates file if script is ran for 1st time and file not present - except (ValueError, FileNotFoundError): - with open('temp_pics_source_list.txt', 'w') as f: - json.dump(temp_pics_source_list, f) - - # Append to master training dataframes, drop potential dupes and save - expanded_class.to_csv('expanded_class.csv') - expanded_dropd.to_csv('expanded_dropd.csv') - - return expanded_class, expanded_dropd - - def dl_pic(self,dict_pics, pic): - - try: - - # check if image exists in current working directory. avoids dupes - if os.path.exists(dict_pics[pic]): - pass - - else: - - try: - - r = requests.get(pic, stream=True) - r.raw.decode_content = True - with open(dict_pics[pic], 'wb') as f: - shutil.copyfileobj(r.raw, f) - - except ConnectionError: - return - - except KeyError: - pass - - def dict_pics(self): - - try: - with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments - target_dir = json.load(f) - - except (ValueError, FileNotFoundError): - target_dir = input('No target dirctory found. Create One? [y] or [n]:') - if target_dir == ('y' or 'Y'): - target_dir = input('Please provide full URL to destination folder:') # TODO need to catch human syntax errors here - with open('target_dirs.txt','w') as f: - json.dump(target_dir, f) - - else: - os.mkdir(os.getcwd()+os.sep+'training_images') - target_dir = os.getcwd()+os.sep+'training_images' - with open('target_dirs.txt','w') as f: - json.dump(target_dir, f) - print('Creating default folder in current directory @ ' + target_dir) - - # open url list in working directory - with open('temp_pics_source_list.txt') as f: - - try: - temp_pics_source_list = json.load(f) - - except (ValueError, FileNotFoundError): - print('url list not found. 
-    def dict_pics(self):
-
-        try:
-            with open('target_dirs.txt', 'r+') as f:  # TODO add an option to change the directory here, too. Look up how to have optional arguments
-                target_dir = json.load(f)
-
-        except (ValueError, FileNotFoundError):
-            target_dir = input('No target directory found. Create one? [y] or [n]:')
-            if target_dir.lower() == 'y':
-                target_dir = input('Please provide the full path to the destination folder:')  # TODO need to catch human syntax errors here
-                with open('target_dirs.txt', 'w') as f:
-                    json.dump(target_dir, f)
-
-            else:
-                os.mkdir(os.getcwd() + os.sep + 'training_images')
-                target_dir = os.getcwd() + os.sep + 'training_images'
-                with open('target_dirs.txt', 'w') as f:
-                    json.dump(target_dir, f)
-                print('Creating default folder in current directory @ ' + target_dir)
-
-        # open the url list in the working directory
-        with open('temp_pics_source_list.txt') as f:
-
-            try:
-                temp_pics_source_list = json.load(f)
-
-            except (ValueError, FileNotFoundError):
-                print('url list not found. aborting')
-                return
-
-        dict_pics = {}
-
-        # make the custom dict, {source: target}, and name images from the unique URL pattern
-        for k in temp_pics_source_list:
-            patt_1 = re.search(r'[^/]+(?=/\$_|.(\.jpg|\.jpeg|\.png))', k, re.IGNORECASE)
-            patt_2 = re.search(r'(\.jpg|\.jpeg|\.png)', k, re.IGNORECASE)
-            if patt_1 is not None and patt_2 is not None:
-                tag = patt_1.group() + patt_2.group().lower()
-                file_name = target_dir + os.sep + tag
-                dict_pics.update({k: file_name})
-
-        with open('dict_pics.txt', 'w') as f:
-            json.dump(dict_pics, f)
-
-        return dict_pics  # TODO still need a solution for outliers (i.e., a naming scheme for unusual source URLs)
-
-    def dl_pictures(self, *dict_pics):
-        '''
-        Downloads pictures from the api to local storage using temp_pics_source_list
-        and dict_pics
-        '''
-
-        if not dict_pics:
-            dict_pics = self.dict_pics()
-
-        with open('temp_pics_source_list.txt') as f:
-            try:
-                temp_pics_source_list = json.load(f)
-            except (ValueError, FileNotFoundError):
-                print('url list not found. download aborted')
-                return
-
-        bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for _ in executor.map(lambda p: self.dl_pic(*p), bargs):
-                pass  # dl_pic returns None either way; connection errors are handled inside dl_pic
-
-class PreProcessing:
-    '''
-    Includes methods for pre-processing the training set input and labels in the
-    training set created from the CurateData class. Whereas the CurateData training
-    sets provide trimmed-down data from the raw json response of the
-    ShoppingApi call and a bare-minimum format for the dataframe to be
-    used in training, PreProcessing optimizes that dataframe for training and
-    includes methods for image manipulation, creating test/train/validation
-    splits, etc.
-    '''
-
-    def dict_pics(self):
-        '''
-        Source-to-target mapping for training. Replaces the source image URL with the target URL
-        determined by the values in the dict_pics variable.
-        '''
-
-        target_dir = os.getcwd()
-        with open('temp_pics_source_list.txt') as f:
-            temp_pics_source_list = json.load(f)
-        dict_pics = {k: target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
-        print("{source:target} dictionary created @ " + os.getcwd() + os.sep + 'training_images')
-        return dict_pics
-
-    # TODO pipeline gameplan: 5 files: dict_pics.txt, raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv
-    # cont... open raw_json.txt and append, same with the csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
-    # if not exists, append to the master img download dict
-    # --> concat m_class_training df and m_nvl_training dfs with the new data. Need to add inclusion tests for all files when opened and appended/concatted
-
-def main():
-    '''
-    Main program creates/updates a csv file to use for ML training from live
-    eBay listings
-    '''
-    pass
-# main goes here:
-
-if __name__ == "__main__":
-    main()
-
-'''
-Based on a sample set of 10 images: at an average of 5 images per listing, downloading a hundred thousand
-listings (the daily call cap) comes to roughly 102 GB of image data. That's just for one day. With more than
-a million listings you're looking at a little over 1 TB of image data, and you don't even know yet whether
-the data is any good.
-'''
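The PreProcessing docstring mentions test/train/validation splits but the file does not implement them yet. A minimal sketch follows, assuming expanded_class.csv has been written by expand_nvlclass and that scikit-learn is available; neither assumption comes from the original file.

# Sketch: stratified train/validation split of the expanded class-training table.
from sklearn.model_selection import train_test_split

expanded_class = pd.read_csv('expanded_class.csv', index_col=0)
train_df, val_df = train_test_split(
    expanded_class,
    test_size=0.2,
    stratify=expanded_class['PrimaryCategoryID'],  # keep category proportions in both splits
    random_state=42,
)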