changing custom dict to dl_pictures function and making temp_dict_pics.txt file with target_dirs.txt
This commit is contained in:
parent ef237b8a1b
commit fb327a9375
@@ -15,7 +15,6 @@ expanded_dfs = curate.expand_nvlclass(class_training, dropd)
expanded_class = expanded_dfs[0]
expanded_dropd = expanded_dfs[1]
dict_pics = expanded_dfs[2]
# dict_pics = expanded_dfs[2]

# TODO # need to replace expanded df's PictureURL col values with destination urls
# TODO # still have the problem of duplicate listings. Possibly take care of this before you run curate
ebay_api.py (106 changed lines)
@@ -1,11 +1,11 @@
import importlib
import os
import numpy as np
import concurrent.futures
import json
import requests
import pandas as pd
import config as cfg
import wget # NOTE may not need this
import shutil
import re
@@ -125,7 +125,7 @@ class ShoppingApi:
        try:
            response = requests.get("https://open.api.ebay.com/shopping?", params=params, timeout=1)
            response.raise_for_status()
        except request.exceptions.RequestException:
        except requests.exceptions.RequestException:
            print('connection error')
        response = response.json()
        response = response['Item']
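The one-line change in this hunk swaps the misspelled `request.exceptions.RequestException` for the imported `requests.exceptions.RequestException`. As a hedged sketch (not this project's code), the same guard can be wrapped in a helper so a failed call returns early instead of falling through to `response.json()`; the helper name, endpoint, and params below are placeholders:

import requests

def safe_get_json(url, params, timeout=1):
    '''Return parsed JSON, or None if the request fails.'''
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print('connection error:', err)
        return None
    return response.json()

# Hypothetical usage; the real call is made inside ShoppingApi with its own params.
items = safe_get_json('https://open.api.ebay.com/shopping?', params={}, timeout=1)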
@@ -251,14 +251,14 @@ class CurateData:
        dropd = nvl_training.drop(col_drop, axis=1)
        return dropd

    def combine_nvlclass(self, class_training, dropd):
        final_training = pd.concat([class_training, dropd], axis=1)
        return final_training # TODO might not need this function

    def make_dict_pics(self, expanded_class_training):
        with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up
            dict_pics = json.load(jf)
            dict_pics.extend('<')
    # def combine_nvlclass(self, class_training, dropd):
    #     final_training = pd.concat([class_training, dropd], axis=1)
    #     return final_training # TODO might not need this function
    #
    # def make_dict_pics(self, expanded_class_training):
    #     with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up
    #         dict_pics = json.load(jf)
    #         dict_pics.extend('<')


    def expand_nvlclass(self, class_training, dropd):
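For context, `combine_nvlclass` (shown both live and commented out above) just concatenates the class-label frame and the name-value frame column-wise on their shared index. A minimal sketch with toy columns, not the project's real data:

import pandas as pd

# Made-up stand-ins for class_training and dropd.
class_training = pd.DataFrame({'PictureURL': ['a.jpg', 'b.jpg'], 'CategoryID': [1, 2]})
dropd = pd.DataFrame({'Brand': ['Nike', 'Adidas'], 'Size': ['10', '9.5']})

final_training = pd.concat([class_training, dropd], axis=1)
print(final_training)  # two rows, four columns: PictureURL, CategoryID, Brand, Size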
@@ -269,54 +269,96 @@ class CurateData:
        '''
        expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object
        expanded_class = expanded_class.dropna(subset=['PictureURL'])
        expanded_class = expanded_class.drop_duplicates(subset=['PictureURL'])
        expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
        expanded_class = expanded_class.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        # expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop duplicates here or before instantiating curate object
        expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
        expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL'])
        expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
        expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
        # expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])

        expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
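A minimal sketch (toy frame, not project data) of what the explode / dropna / drop_duplicates chain above does to a list-valued PictureURL column:

import pandas as pd

df = pd.DataFrame({
    'ItemID': [1, 2],
    'PictureURL': [['a.jpg', 'b.jpg'], ['b.jpg', None]],
})

expanded = df.explode('PictureURL').reset_index(drop=True)   # one row per URL
expanded = expanded.dropna(subset=['PictureURL'])            # drop missing URLs
expanded = expanded.drop_duplicates(subset=['PictureURL']).reset_index(drop=True)
print(expanded)  # rows (1, 'a.jpg') and (1, 'b.jpg'); the duplicate 'b.jpg' row is gone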
        dict_pics_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
        destination = 'your target folder' # decide whether or not you want to set a default folder to have the user define it as input every time. or have this only
        # defined in the download function
        dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
        expanded_class = expanded_class
        # with open('dict_pics.txt','w+') as f: # TODO open if it exists, or write if not, then extend the dictionary with dict_pics

        # '''Will use temp_dict_pics for changing the training set at preprocessing'''
        # temp_dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
        # # TODO decide if the above is necesssary at this point or if it should
        # # be created at preprocessing or download
        #
        # with open('dict_pics.txt', 'w') as f:
        #     try:
        #         dict_pics = json.load(f)
        #         dict_pics.update(temp_dict_pics)
        #         json.dump(dict_pics, f) # TODO This completely overwrites the old file. Fix to exclude corruptions
        #
        #     except ValueError:
        #         json.dump(temp_dict_pics, f)
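As a hedged illustration of the dict comprehension above (made-up URL and folder, not project data), the lookahead regex pulls the path segment before `/$_` out of an eBay picture URL and uses it as the local .jpg name:

import re

destination = 'training_images/'  # hypothetical target folder
url = 'https://i.ebayimg.com/00/s/ODQwWDE2MDA=/z/AbCdEfG/$_1.JPG'  # made-up unique code

match = re.search(r'[^/]+(?=/\$_|.jpg)', url, re.IGNORECASE)
print(match.group())                         # AbCdEfG
print(destination + match.group() + '.jpg')  # training_images/AbCdEfG.jpg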
        return expanded_class, expanded_dropd, dict_pics # TODO still need to replace source url to destination url in df cols and create custom dict {<source>, <distination>}
        with open('dict_pics_list.txt', 'a+') as f: # Temp iterable for use w/executor
            try:
                dict_pics_list = json.load(f)
                dict_pics_list.append(dict_pics_list)
                json.dump(dict_pics_list, f)

            except ValueError:
                json.dump(dict_pics_list, f)

        return expanded_class, expanded_dropd
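Note that the `dict_pics_list.txt` block above opens the file in 'a+' mode (so `json.load` reads from the end of the file and raises) and appends the list to itself. A hedged sketch of one way to persist the URL list between runs, keeping the same file name but with an assumed helper name and merge logic:

import json
import os

def save_url_list(new_urls, path='dict_pics_list.txt'):
    '''Merge new picture URLs into the JSON list stored at path.'''
    stored = []
    if os.path.exists(path):
        with open(path) as f:
            try:
                stored = json.load(f)
            except ValueError:  # empty or corrupt file
                stored = []
    stored = list(set(stored) | set(new_urls))  # de-duplicate across runs
    with open(path, 'w') as f:
        json.dump(stored, f)
    return stored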
    def dl_pictures(self):
        '''
        Downloads pictures from api to local storage using custom master dict
        '''

        with open('dict_pic.txt', 'w+') as jf: # avoid duplicate logic goes here... I think
        with open('target_dirs.txt', 'a+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
            try:
                target_dir = json.load(f)
            except ValueError:
                target_dir = input('No default directory found. Create One? [y] or [n]:')
        with open('dict_pics.txt') as jf:
            dict_pics = json.load(jf)

        r = requests.get('<dict_pic>', stream=True)
        r.raw.decode_content = True
        filename = '<your destination + naming schem.jpg>'
        with open(filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
        with open('dict_pics_list.txt') as f:
            dict_pics_list = json.load(f)
        # NOTE consider adding this dl_pictures func inside another func that uses
        # threading to fund the dl_pictures func here somewhere
        def dl_pic(pic):

            # PictureURL in PictureURL list can't be downloaded....have to use indirect address in the form https://i.ebayimg.com/images/g/<unique code>/s-l<size>.jpg
            # in place of https://i.ebayimg.com/00/s/ODQwWDE2MDA=/z/<unique code>/$_1.JPG or use requests methods instead of wget and original PictureURL? yes, use requests
            if os.path.exists(dict_pics[pic]):
                pass
            else:
                r = requests.get(pic, stream=True)
                r.raw.decode_content = True
                with open(dict_pics[pic], 'wb') as f: # might not work?
                    shutil.copyfileobj(r.raw, f)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(dl_pic, dict_pics_list):
                future

        with open('dict_pics_list.txt','w') as f:
            dict_pics_list = []
            json.dump(dict_pics_list, f)
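`executor.map` above yields return values rather than Future objects, so `for future in ...: future` simply drains the iterator. A hedged sketch of the same fan-out with explicit futures, using a placeholder download function and made-up URLs:

import concurrent.futures

def dl_pic(pic_url):
    '''Placeholder worker; the real one streams the image to disk.'''
    return pic_url

urls = ['https://example.com/a.jpg', 'https://example.com/b.jpg']  # made-up

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(dl_pic, url) for url in urls]
    for future in concurrent.futures.as_completed(futures):
        future.result()  # re-raises any exception from the worker thread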
        temp_dict_pics = {k:target_dir+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
        # TODO decide if the above is necesssary at this point or if it should
        # be created at preprocessing or download

        with open('dict_pics.txt', 'w') as f:
            try:
                dict_pics = json.load(f)
                dict_pics.update(temp_dict_pics)
                json.dump(dict_pics, f) # TODO This completely overwrites the old file. Fix to exclude corruptions

            except ValueError:
                json.dump(temp_dict_pics, f)
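The TODO above flags the issue: opening dict_pics.txt with 'w' truncates it before `json.load` can read anything, so the update always lands in the except branch. A hedged read-then-write sketch, keeping the file name but with an assumed helper name:

import json

def update_dict_pics(temp_dict_pics, path='dict_pics.txt'):
    '''Merge the new source->destination mapping into the stored one.'''
    try:
        with open(path) as f:
            dict_pics = json.load(f)
    except (FileNotFoundError, ValueError):
        dict_pics = {}
    dict_pics.update(temp_dict_pics)
    with open(path, 'w') as f:  # rewrite only after a successful read
        json.dump(dict_pics, f)
    return dict_pics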
        # TODO pipeline gameplan: 5 files: master img download dict, raw_json.txt, raw_json.csv, master_class_training.csv, master_nvl_training.csv
        # cont... open raw_json.txt and append, same with csv --> process new data --> pull out image source+dest and expand new dfs for the additional pictures
        # if not exists and append to master img download dict
        # --> concat m_class_training df and m_nvl_training dfs with new data. Need to add inclusion tests for all files when opened and appended/concatted

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for future in executor.map(download_function, master_url_dict):
                pass
    def update_df(self, data): # TODO save raw df as csv file
        '''
        Creates training instances for dataset. picture_url_list expanded to