needless files removed

2022-02-24 21:21:22 -07:00
parent 0bec7088c9
commit 597372cad3
1 changed files with 0 additions and 595 deletions
@@ -1,595 +0,0 @@
-import os
-from time import sleep
-from random import randint
-import scrape_ids
-from datetime import datetime, timedelta
-import dateutil
-from dateutil import parser
-import pytz
-import pdb
-from io import StringIO
-import numpy as np
-import concurrent.futures
-import json
-import requests
-import pandas as pd
-import config as cfg
-import shutil
-import re
-import urllib, base64
-
-from ebaysdk.exception import ConnectionError
-from ebaysdk.trading import Connection as Trading
-from ebaysdk.finding import Connection as Finding
-from ebaysdk.shopping import Connection as Shopping
-
-# renew oauth token for shopping api
-def getAuthToken():
-    '''
-    Requests a fresh application OAuth token (client_credentials grant) from
-    eBay's identity endpoint, saves it as JSON in temp_oauth_token.txt and
-    returns it.
-
-    Credentials are read from cfg.oauth ("client_id", "client_secret",
-    "RuName").
-
-    Returns:
-        The value of the "access_token" field of the JSON response.
-
-    Raises:
-        requests.exceptions.RequestException: the request failed or timed out.
-        KeyError: the response carried no "access_token"; the message is the
-            response's "error_description" when one is present.
-    '''
-    AppSettings = {
-        'client_id': cfg.oauth["client_id"],
-        'client_secret': cfg.oauth["client_secret"],
-        'ruName': cfg.oauth["RuName"],
-    }
-
-    # HTTP Basic credentials are base64("client_id:client_secret"). Decode the
-    # resulting bytes instead of slicing the "b'...'" repr of the bytes object.
-    authHeaderData = AppSettings['client_id'] + ':' + AppSettings['client_secret']
-    encodedAuthHeader = base64.b64encode(authHeaderData.encode()).decode('ascii')
-
-    headers = {
-        # the request body below is sent form-encoded
-        "Content-Type": "application/x-www-form-urlencoded",
-        "Authorization": "Basic " + encodedAuthHeader,
-    }
-
-    body = {
-        "grant_type": "client_credentials",
-        "redirect_uri": AppSettings['ruName'],
-        "scope": "https://api.ebay.com/oauth/api_scope",
-    }
-
-    tokenURL = "https://api.ebay.com/identity/v1/oauth2/token"
-
-    # requests form-encodes a dict passed as data=, so no manual urlencode is
-    # needed; the timeout keeps a stalled connection from hanging the script.
-    response = requests.post(tokenURL, headers=headers, data=body, timeout=24).json()
-
-    if 'access_token' not in response:
-        # surface the service's own explanation instead of a bare KeyError
-        raise KeyError(response.get('error_description', 'access_token'))
-    access_token = response['access_token']
-
-    with open('temp_oauth_token.txt', 'w') as f:
-        json.dump(access_token, f)
-
-    return access_token
-
-class FindingApi:
-    '''
-    Methods for accessing eBay's FindingApi services
-    '''
-
-    def __init__(self, service):
-        '''
-        Selects the Finding API operation by list index.
-
-        Args:
-            service: index (0-5) into the operation list below.
-        '''
-        self.service = [
-            'findItemsAdvanced', 'findCompletedItems',
-            'findItemsByKeywords', 'findItemsIneBayStores', 'findItemsByCategory',
-            'findItemsByProduct'
-            ][service] # Currently using only index 4, i.e., service = 4
-
-    def get_data(self, category_id):
-        '''
-        Gets raw JSON data from a FindingApi service call and extracts the
-        itemId of every listed item. Currently being used to get itemIDs from
-        categories.
-
-        Args:
-            category_id: value sent as the "categoryId" request parameter.
-
-        Returns:
-            List of unique itemId values in first-seen order. The list is
-            empty when the request fails, and holds whatever was collected so
-            far when the response does not have the expected structure.
-        '''
-#        startTime = dateutil.parser.isoparse( startTime )
-#        now = datetime.datetime.now(tz=pytz.UTC)
-#        days_on_site = (now - startTime).days # as int
-
-        ids = []
-        params = {
-            "OPERATION-NAME":self.service,
-            "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
-            "SERVICE-VERSION":"1.13.0",
-            "RESPONSE-DATA-FORMAT":"JSON",
-            "categoryId":category_id,
-            "paginationInput.entriesPerPage":"100",
-            # NOTE(review): the Finding API documents this field as
-            # "pageNumber" (lower-case p) -- confirm before relying on paging.
-            "paginationInput.PageNumber":"1",
-            "itemFilter(0).name":"Condition",
-            "itemFilter(0).value":"Used",
-            # every repeated filter gets its own index; an un-indexed
-            # itemFilter next to itemFilter(0) is ambiguous
-            "itemFilter(1).name":"HideDuplicateItems",
-            "itemFilter(1).value":"true",
-            "sortOrder":"StartTimeNewest",
-            }
-#            "itemFilter(2).name":"TopRatedSellerOnly", # TODO fix here
-#            "itemFilter(2).value":"true"
-
-        try:
-            response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
-                params=params, timeout=24)
-            response.raise_for_status()
-
-        except requests.exceptions.RequestException: # appears this works need to be able to continue where you left off or use better timeout?
-            print('connection error')
-            return ids
-
-        try:
-            data = response.json()
-            # the top-level key is "<operation name>Response", so derive it
-            # from the selected service instead of hard-coding one operation
-            for item in data[self.service + 'Response'][0]['searchResult'][0]['item']:
-                ids.append(item['itemId'][0])
-
-        except (AttributeError, KeyError, IndexError, TypeError, ValueError):
-            print('AttributeError or KeyError. Exiting')
-            # print the raw body: calling response.json() again could raise
-            # inside this handler when the body is not valid JSON
-            print(response.text)
-            return ids
-
-        # de-duplicate while keeping the order the service returned
-        return list(dict.fromkeys(ids))
-
-    # TODO add some other options to finding call api such as for possibly filtering for used items only. This might give you a better dataset for training. Or maybe a mixture of new and used. Maybe
-    # try and come up with a way to mathematically determine your odds of maximizing the number of pictures in your training set while reducing the number of useless images. Say for example, if you took a
-    # random set of 3 of 8 pictures total from each listing you might have a better chance of getting 3 good pictures in addition to increasing your training set. Or maybe you would have better luck with limiting
-    # it to the first 5 pictures instead of random.
-
-    # You may even have more consistency with used shoes since they are "one-off" items without confusing multiple variations and colors. What else you can do is run small training sets on both new and used
-    # to see which one is more accurate or if a combo of both is more accurate.
-
-    def get_ids_from_cats(self):
-        '''
-        Collects itemIds for every category in cat_list.txt, merges them into
-        master_ids.txt and groups them into comma-joined batches of up to 20
-        ids for the ShoppingApi call.
-
-        Returns:
-            (twenty_id_list, ids): the list of comma-joined id batches and the
-            flat list of unique ids gathered by this call.
-        '''
-
-        ids = []
-
-        # load category id list
-        with open('cat_list.txt') as jf:
-            cat_list = json.load(jf)
-
-        # load list of master ids; start empty on a first run, the same way
-        # ShoppingApi.conky() treats a missing raw_data.txt
-        try:
-            with open('master_ids.txt') as f:
-                master_ids = json.load(f)
-        except (FileNotFoundError, ValueError):
-            master_ids = []
-
-        # fetch ids with calls to Finding Api given cats as param
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for cat_ids in executor.map(self.get_data, cat_list):
-                ids.extend(cat_ids)
-
-        # an item can be returned for more than one category; keep one copy so
-        # no batch slot is spent on a repeated id
-        ids = list(dict.fromkeys(ids))
-
-        # append master ids list with temporary ids from single function call and save
-        master_ids.extend(ids)
-        master_ids = list(set(master_ids))
-        with open('master_ids.txt', 'w') as f:
-            json.dump(master_ids, f)
-
-        # 20-ItemID list created to maximize dataset/decrease calls provided call constraints
-        twenty_id_list = [','.join(ids[n:n+20]) for n in range(0, len(ids), 20)]
-
-        return twenty_id_list, ids
-
-class ShoppingApi:
-    '''
-    Creates objects from ShoppingApi service calls that can interact with
-    pandas dataframes
-    '''
-
-#    def __init__(self):
-#        
-#        # renew oauth token
-#        oauth_response = getAuthToken()
-#        access_token = oauth_response[0]
-#        
-#        self.access_token = access_token
-
-    @staticmethod
-    def _child_category_ids(response):
-        '''
-        Returns the CategoryID of every entry after the first one in
-        response['CategoryArray']['Category']; index 0 is skipped because it
-        is the parent node, i.e., the women's or men's dept. itself.
-        '''
-        return [cat['CategoryID'] for cat in response['CategoryArray']['Category'][1:]]
-
-    def update_cats(self):
-        '''
-        Updates cat_list.txt (all child categories) as well as
-        women_cat_list.txt and men_cat_list.txt (child categories per
-        department). Nothing is written when any request fails, so existing
-        files are never replaced with partial data.
-        '''
-
-        # parent category id -> file that receives its child categories
-        # (Women's and Men's shoe departments)
-        parent_cats = {'3034': 'women_cat_list.txt', '93427': 'men_cat_list.txt'}
-        cat_list = []
-        cats_by_file = {}
-        # TODO make sep lists for women's and men's shoe cats. Needed to train
-        # mens and women's cats separately. This might improve val. acc. during training
-
-        with open('temp_oauth_token.txt') as f:
-            access_token = json.load(f)
-
-        headers = {
-            "X-EBAY-API-IAF-TOKEN":access_token,
-            "version":"671",
-            }
-
-        for department, file_name in parent_cats.items():
-
-            url = "https://open.api.ebay.com/shopping?&callname=GetCategoryInfo&responseencoding=JSON&IncludeSelector=ChildCategories&CategoryID="+department
-
-            try:
-                response = requests.get(url, headers=headers, timeout=4)
-                response.raise_for_status()
-                temp_cat_list = self._child_category_ids(response.json())
-
-            except (requests.exceptions.RequestException, KeyError, ValueError):
-                # stop here: there is no usable response to parse
-                print('connection error')
-                return
-
-            # keyed by the department that was requested, so each list always
-            # lands in the matching file
-            cats_by_file[file_name] = temp_cat_list
-            cat_list.extend(temp_cat_list)
-
-        with open('cat_list.txt', 'w') as f:
-            json.dump(cat_list, f)
-        for file_name, cats in cats_by_file.items():
-            with open(file_name, 'w') as f:
-                json.dump(cats, f)
-
-    def get_item_from_findItemsByCategory(self, twenty_id):
-
-        '''
-        Gets raw JSON data from multiple live listings given multiple itemIds
-
-        Args:
-            twenty_id: comma-joined itemIds appended to the request URL.
-
-        Returns:
-            The "Item" value of the JSON response, or None when the request
-            fails or the response has no "Item" key.
-        '''
-        with open('temp_oauth_token.txt') as f:
-            access_token = json.load(f)
-
-        headers = {
-            "X-EBAY-API-IAF-TOKEN":access_token,
-            "version":"671",
-            }
-
-        url = "https://open.api.ebay.com/shopping?&callname=GetMultipleItems&responseencoding=JSON&IncludeSelector=ItemSpecifics&ItemID="+twenty_id
-
-        # bound up front so the diagnostic print below works even when
-        # requests.get() itself raises
-        response = None
-        try:
-
-            response = requests.get(url, headers=headers,timeout=24)
-            response.raise_for_status()
-            response = response.json()
-            item = response['Item']
-
-        except (requests.exceptions.RequestException, KeyError, ValueError):
-            print('connection error. IP limit possibly exceeded')
-            print(response)
-            return # returns NoneType. Handled at conky()
-
-        return item
-
-    def conky(self, twenty_ids_list):
-        '''
-        Runs get_item_from_findItemsByCategory in multiple threads to get relevant
-        data for creating training sets
-
-        Args:
-            twenty_ids_list: list of comma-joined itemId batches.
-
-        Returns:
-            The contents of raw_data.txt (empty when missing or unreadable)
-            extended with the newly fetched listings; the same list is
-            written back to raw_data.txt.
-        '''
-        try:
-            with open('raw_data.txt') as f:
-                data = json.load(f)
-        except (FileNotFoundError, ValueError):
-            data = []
-
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for items in executor.map(self.get_item_from_findItemsByCategory, twenty_ids_list):
-                if items is not None:
-                    # The end result should be a list of dicts where each dict in the list is a listing
-                    data.extend(items)
-                else:
-                    # a failed batch ends collection; everything gathered up
-                    # to this point is still saved below
-                    print('response is None')
-                    break
-        with open('raw_data.txt', 'w') as f:
-            json.dump(data, f)
-        return data
-
-# NOTE:
-
-# Limited to 5000 calls to the Shopping API per day, and the GetMultipleItems service maxes out at 20 items
-# per call, leaving you 100,000 items per day for your pandas dataframe initially. So you'll have
-# to divide these up into the categories. This will leave you with about 6.25K results per cat.
-# More than enough data for your dataset.
-
-
-class CurateData:
-    '''
-    Contains methods for curating data for machine learning training sets;
-    Takes item in data from ShoppingApi request as argument and extracts/ creates key
-    value pairs that gets updated to custom dataframe used in Ml training sets.
-    '''
-
-    def import_raw(self):
-        '''
-        imports raw response json from local file. This is data from
-        GetMultipleItems call in ShoppingApi
-        '''
-        with open('raw_data.txt') as f:
-            return json.load(f)
-
-    def raw_df(self, raw_data):
-        '''
-        creates pandas df from raw json and saves master raw csv file as raw_df.csv.
-        Intended to be used inline with direct
-        data stream from ebay's APIs
-
-        Rows repeating an ItemID are dropped (first occurrence kept) when the
-        frame has an ItemID column.
-        '''
-        to_json = json.dumps(raw_data)
-        raw_df = pd.read_json(StringIO(to_json))
-        if 'ItemID' in raw_df.columns:
-            raw_df = raw_df.drop_duplicates(subset=['ItemID']).reset_index(drop=True)
-        raw_df.to_csv('raw_df.csv') # NOTE not append mode because raw_df is made from the master raw_data.txt file
-
-        # TODO still saving "Unnamed:0" column
-
-        return raw_df
-
-    def to_training(self, raw_data):
-        '''
-        creates first pass of potential labels for training set. This is the base
-        df used to produce other training sets to use.
-
-        Returns:
-            DataFrame with columns ItemID, PictureURL, PrimaryCategoryID,
-            PrimaryCategoryName, Title and ItemSpecifics; ItemID and
-            PrimaryCategoryID are cast to str and rows without ItemSpecifics
-            are dropped.
-        '''
-        raw_df = self.raw_df(raw_data)
-        # .copy() so the cast below writes to an independent frame, not to a
-        # slice of raw_df
-        interm_df1 = raw_df.loc[:,['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']].copy()
-        interm_df1[['ItemID', 'PrimaryCategoryID']] = interm_df1[['ItemID', 'PrimaryCategoryID']].astype(str)
-        training = interm_df1.dropna(subset=['ItemSpecifics'])
-        return training # TODO RENAME THIS FUNC AND its RETURN VALUE
-
-    def class_training(self, training):
-        '''Training set for multiclass portion of training set. Used to train
-        separately from multilabel portion
-        '''
-        class_training = training.loc[:, ['PictureURL', 'PrimaryCategoryID']]
-        return class_training
-
-    def nvl_training(self, training):
-        '''
-        Training set for multilabel portion
-
-        Builds one column per item-specific Name (cell = that entry's Value)
-        next to PictureURL. Rows are matched by position and the result
-        carries a fresh 0..n-1 index.
-        '''
-        name_value_lists = training['ItemSpecifics'].apply(lambda x: x['NameValueList'])
-
-        def to_name_value_dict(nvl):
-            # defensive: treat a lone dict as a one-entry list
-            # (NOTE(review): confirm whether the API ever returns that shape)
-            if isinstance(nvl, dict):
-                nvl = [nvl]
-            return {entry['Name']: entry['Value'] for entry in nvl}
-
-        # Necessary for json_normalize():
-        nvl_dicts = name_value_lists.apply(to_name_value_dict).tolist()
-        nvl_df = pd.json_normalize(nvl_dicts)
-
-        # training keeps the gaps left by dropna() in its index while nvl_df is
-        # numbered 0..n-1; reset so concat pairs rows by position rather than
-        # by mismatched labels
-        picture_urls = training['PictureURL'].reset_index(drop=True)
-        nvl_training = pd.concat([picture_urls, nvl_df], axis=1)
-
-        return nvl_training
-
-    def extract_df(self, df):
-        '''
-        converts lists in any df to a space-joined string; null cells become
-        NaN and every other value is left as is
-        '''
-        def flatten(x):
-            if isinstance(x, list):
-                return ' '.join(map(str, x))
-            return np.nan if pd.isnull(x) else x
-
-        # DataFrame.map is the current name of the element-wise apply;
-        # fall back to applymap on pandas versions that predate it
-        elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
-        extracted_df = elementwise(flatten)
-
-        return extracted_df
-
-    def drop_nvl_cols(self, nvl_training):
-        '''
-        Asks whether to trim nvl_training to PictureURL plus the labels listed
-        in cat_spacs.txt that are not on the drop lists below.
-
-        Returns:
-            nvl_training unchanged when the answer starts with "n", otherwise
-            the trimmed frame.
-        '''
-
-        with open('cat_spacs.txt') as f:
-            cat_spacs = json.load(f)
-
-        drop = ['Year Manufactured', 'MPN', 'Platform Height', 'Product Line', 
-                'Personalize', 'Fabric Type', 'Customized','Release Year',
-                'Heel to Toe Drop', 'Midsole Type', 'Cleat Type', 'Handmade',
-                'Signed', 'Silhouette', 'Insole Material', 'Lining Material',
-                'California Prop 65 Warning', 'Character Family', 'Character',
-                'Cushioning Level', 'Personalization Instructions', 'Pronation',
-                ]
-        drop_2 = ['Calf Width', 'Theme', 'Outsole Material', 'Style Code', 'Features',
-                'EU Shoe Size', 'AU Shoe Size', 'Vintage', 'US Shoe Size', 
-                'Country/Region of Manufacture', 'Brand', 'Model']
-        unwanted = set(drop) | set(drop_2)
-        cat_spacs = [cat for cat in cat_spacs if cat not in unwanted]
-
-        user_input = input('drop cols? (y,n; default=y): ')
-
-        if user_input.strip().lower().startswith('n'):
-            dropd = nvl_training
-        else:
-            # PictureURL first, then the wanted labels actually present
-            cols = ['PictureURL'] + [col for col in cat_spacs if col in nvl_training.columns]
-            dropd = nvl_training[cols]
-
-        return dropd
-
-    # for future reference, to deal with inconsistent values in the nvl (due to sellers inputting custom values in the fields) you can drop either listings or k/v pairs that are unique which
-    # can be determined from applying a function to determine frequency of k/v pairs--> list of unique k/v pairs--> function to determine frequency of unique k/v pairs--> drop those that have 1.
-
-    # Check the above list of cols I want to keep to see if there are duplicates with diff spelling and phrasing (e.g., Departement and Department, or Fastening and Closure Type)
-
-    def expand_nvlclass(self, class_training, dropd):
-        '''
-        takes image url list from each cell and expands them into separate/duplicate
-        instances, or keeps only the first url of each listing, depending on
-        the answer given at the prompt. Merges the resulting urls into
-        temp_pics_source_list.txt and saves expanded_class.csv and
-        expanded_dropd.csv. The frames passed in are not modified.
-        * consider applying this function to other cells that have multiple values in their lists
-
-        Returns:
-            (expanded_class, expanded_dropd)
-        '''
-        expand = input("expand image list or use primary listing image? (y or n): ")
-        # `('y' or 'Y') in expand` only ever tested for a lower-case 'y'
-        if expand.strip().lower().startswith('y'):
-            expanded_class = class_training.explode('PictureURL').reset_index(drop=True)
-            expanded_class = expanded_class.dropna(subset=['PictureURL'])
-            expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-
-            expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True)
-            expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
-            expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
-
-        else:
-            def first_url(urls):
-                # first url of a non-empty list; NaN for an empty list or a
-                # missing cell so the row is dropped below
-                if isinstance(urls, str):
-                    return urls
-                if isinstance(urls, (list, tuple)) and len(urls) > 0:
-                    return urls[0]
-                return np.nan
-
-            class_training = class_training.copy()
-            class_training['PictureURL'] = class_training['PictureURL'].apply(first_url)
-            expanded_class = class_training.dropna()
-
-            dropd = dropd.copy()
-            dropd['PictureURL'] = dropd['PictureURL'].apply(first_url)
-            expanded_dropd = dropd.dropna(subset=['PictureURL'])
-
-        expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
-
-        # merge this run's urls into temp_pics_source_list.txt; a missing or
-        # unreadable file (e.g. first run) counts as empty
-        try:
-            with open('temp_pics_source_list.txt') as f:
-                tpsl = json.load(f)
-        except (ValueError, FileNotFoundError):
-            tpsl = []
-
-        # ensures no duplicate source URLs exist
-        temp_pics_source_list = sorted(set(tpsl) | set(expanded_class.PictureURL.to_list()))
-        with open('temp_pics_source_list.txt', 'w') as f:
-            json.dump(temp_pics_source_list, f)
-
-        # Append to master training dataframes, drop potential dupes and save
-        expanded_class.to_csv('expanded_class.csv')
-        expanded_dropd.to_csv('expanded_dropd.csv')
-
-        return expanded_class, expanded_dropd
-
-    def dl_pic(self,dict_pics, pic):
-        '''
-        Downloads one image to its target path unless a file is already there.
-
-        Args:
-            dict_pics: {source url: target path} dict.
-            pic: source url to download.
-
-        Returns:
-            True when the target already exists or was downloaded, False when
-            pic has no entry in dict_pics, None when the download failed.
-        '''
-        try:
-            target = dict_pics[pic]
-        except KeyError:
-            return False
-
-        # check if image exists in current working directory. avoids dupes
-        if os.path.exists(target):
-            return True
-
-        try:
-            # the module-level name ConnectionError is ebaysdk's exception, so
-            # catch the requests hierarchy explicitly
-            with requests.get(pic, stream=True, timeout=24) as r:
-                # never save an HTTP error page as an image
-                r.raise_for_status()
-                with open(target, 'wb') as f:
-                    for chunk in r.iter_content(chunk_size=8192):
-                        f.write(chunk)
-
-        except requests.exceptions.RequestException:
-            # drop a partial file so the next run retries this url
-            if os.path.exists(target):
-                os.remove(target)
-            return None
-
-        return True
-
-    def dict_pics(self):
-        '''
-        Builds the {source url: target file path} dict for every url in
-        temp_pics_source_list.txt and saves it to dict_pics.txt. The target
-        folder is read from target_dirs.txt; when that file is missing or
-        unreadable the user is asked for a folder and the answer is saved.
-
-        Returns:
-            The dict, or None when temp_pics_source_list.txt cannot be read.
-        '''
-
-        try:
-            with open('target_dirs.txt') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
-                target_dir = json.load(f)
-
-        except (ValueError, FileNotFoundError):
-            answer = input('No target dirctory found. Create One? [y] or [n]:')
-            # `== ('y' or 'Y')` only ever compared against 'y'
-            if answer.strip().lower() == 'y':
-                target_dir = input('Please provide full URL to destination folder:') # TODO need to catch human syntax errors here
-
-            else:
-                target_dir = os.getcwd()+os.sep+'training_images'
-                # exist_ok: the folder may be left over from an earlier run
-                os.makedirs(target_dir, exist_ok=True)
-                print('Creating default folder in current directory @ ' + target_dir)
-
-            with open('target_dirs.txt','w') as f:
-                json.dump(target_dir, f)
-
-        # open url list in working directory; open() is inside the try so a
-        # missing file is reported instead of raised
-        try:
-            with open('temp_pics_source_list.txt') as f:
-                temp_pics_source_list = json.load(f)
-
-        except (ValueError, FileNotFoundError):
-            print('url list not found. aborting')
-            return
-
-        dict_pics = {}
-        name_patt = re.compile(r'[^/]+(?=/\$_|.(\.jpg|\.jpeg|\.png))', re.IGNORECASE)
-        ext_patt = re.compile(r'(\.jpg|\.jpeg|\.png)', re.IGNORECASE)
-
-        # make custom dict, {source:target}, and name images from unique URL patt
-        for k in temp_pics_source_list:
-            patt_1 = name_patt.search(k)
-            patt_2 = ext_patt.search(k)
-            if patt_1 is not None and patt_2 is not None:
-                tag = patt_1.group() + patt_2.group().lower()
-                dict_pics[k] = target_dir + os.sep + tag
-
-        with open('dict_pics.txt', 'w') as f:
-            json.dump(dict_pics, f)
-
-        return dict_pics # TODO still need to find sol to outliers (aka, naming scheme for unusual source URLs)
-
-    def dl_pictures(self, *dict_pics):
-        '''
-        Downloads pictures from api to local storage using temp_pics_source_list
-        and dict_pics
-
-        Args:
-            *dict_pics: optionally the {source url: target path} dict as the
-                first positional argument; built with self.dict_pics() when
-                omitted.
-        '''
-
-        if dict_pics:
-            # varargs arrive as a tuple; the dict itself is its first element
-            dict_pics = dict_pics[0]
-        else:
-            dict_pics = self.dict_pics()
-            if dict_pics is None:
-                # dict_pics() already reported the missing url list
-                return
-
-        try:
-            with open('temp_pics_source_list.txt') as f:
-                temp_pics_source_list = json.load(f)
-        except (ValueError, FileNotFoundError):
-            print('url list not found. download aborted')
-            return
-
-        bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for result in executor.map(lambda p: self.dl_pic(*p), bargs):
-                # dl_pic returns None only when the download itself failed
-                if result is None:
-                    print('connection error')
-
-class PreProcessing:
-    '''
-    Includes methods for pre-processing training set input and labels in the
-    training set created from CurateData class. Whereas CurateData training
-    sets provided trimmed down data from the raw json response from the
-    ShoppingApi call and provided a bare minimum format for the dataframe to be
-    used in training, PreProcessing optimizes that dataframe for training and
-    includes methods for image manipulation, creating test/train/validation
-    splits, etc.
-    '''
-
-    def dict_pics(self):
-        '''
-        Source to target training. Replaces source image URL with target URL
-        determined by values in dict_pics variable.
-
-        Returns:
-            {source url: target path} for every url in
-            temp_pics_source_list.txt that matches the naming pattern; the
-            target is "<current working directory>/<matched name>.jpg". Urls
-            without a match are skipped, as in CurateData.dict_pics.
-        '''
-
-        target_dir = os.getcwd()
-        with open('temp_pics_source_list.txt') as f:
-            temp_pics_source_list = json.load(f)
-
-        name_patt = re.compile(r'[^/]+(?=/\$_|.jpg)', re.IGNORECASE)
-        dict_pics = {}
-        for k in temp_pics_source_list:
-            match = name_patt.search(k)
-            if match is None:
-                # no usable file name in this url; calling .group() on None
-                # would raise AttributeError
-                continue
-            dict_pics[k] = target_dir + os.sep + match.group() + '.jpg'
-
-        # NOTE(review): the message names a training_images folder while the
-        # targets above are built from the working directory itself -- confirm
-        # which location is intended.
-        print("{source:target} dictionary created @ " + os.getcwd() + os.sep + 'training_images')
-        return dict_pics
-
-        # TODO pipeline gameplan: 5 files: dict_pics.txt,raw_json.txt, raw_json.csv, expanded_class.csv, expanded_dropd.csv
-        # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
-        # if not exists and append to master img download dict
-        # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted
-
-def main():
-    '''
-    Entry point of the script. Meant to create/update the csv file used for
-    ML training from live ebay listings; for now it is an empty placeholder
-    that does nothing.
-    '''
-    return None
-# main goes here:
-
-if __name__ == "__main__":
-    main()
-
-'''
-Based on your sample set of 10 images, if you have an average of 5 images per
-listing and you download a hundred listings, you will have about 102 Gb of
-image data. That's just for one day. If you have more than a million listings
-you're looking at a little over 1Tb of image data. You don't even know if this
-is good data yet.
-'''