new dl_pic, dict_pics methods and updated dl_pictures. fixed bugs

This commit is contained in:
scott 2022-01-07 18:28:37 -07:00
parent ae25ab88b6
commit 8c41fe1daf
4 changed files with 68 additions and 41 deletions

View File

@ -93,7 +93,7 @@
}
],
"source": [
"def dict_pics():\n",
"def dict_pics_jup():\n",
" target_dir = os.getcwd() + os.sep + \"training_images\"\n",
" with open('temp_pics_source_list.txt') as f:\n",
" temp_pics_source_list = json.load(f)\n",
@ -101,7 +101,7 @@
" print(\"{source:target} dictionary created @ \" + target_dir)\n",
" return dict_pics\n",
"\n",
"dict_pics = dict_pics()\n",
"dict_pics = dict_pics_jup()\n",
"blah = pd.Series(df.PictureURL)\n",
"df = df.drop(labels=['PictureURL'], axis=1)\n",
"blah = blah.apply(lambda x: dict_pics[x])\n",

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 7,
"id": "572dc7fb",
"metadata": {},
"outputs": [],
@ -35,7 +35,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 8,
"id": "8d94196d",
"metadata": {},
"outputs": [],
@ -68,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"id": "a5c72863",
"metadata": {},
"outputs": [],
@ -79,22 +79,28 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 11,
"id": "1057a442",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{source:target} dictionary created @ /tf/training_images\n"
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'group'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_pics_jup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mblah\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPictureURL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PictureURL'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36mdict_pics_jup\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'group'"
]
}
],
"source": [
"def dict_pics():\n",
"def dict_pics_jup(): # \n",
" target_dir = os.getcwd() + os.sep + \"training_images\"\n",
" with open('temp_pics_source_list.txt') as f:\n",
" temp_pics_source_list = json.load(f)\n",
@ -102,7 +108,7 @@
" print(\"{source:target} dictionary created @ \" + target_dir)\n",
" return dict_pics\n",
"\n",
"dict_pics = dict_pics()\n",
"dict_pics = dict_pics_jup()\n",
"blah = pd.Series(df.PictureURL)\n",
"df = df.drop(labels=['PictureURL'], axis=1)\n",
"blah = blah.apply(lambda x: dict_pics[x])\n",

View File

@ -23,6 +23,6 @@ download = input('download images?: ')
if ('y' or 'Y') in download:
with open('temp_pics_source_list.txt') as f:
url_list = json.load(f)
curate.dl_pictures(url_list)
curate.dl_pictures()
else:
pass

View File

@ -379,9 +379,13 @@ class CurateData:
with open('temp_pics_source_list.txt') as f:
tpsl = json.load(f)
tpsl.extend(temp_pics_source_list)
# ensures no duplicate source URLs exist
temp_pics_source_list = list(set(tpsl))
with open('temp_pics_source_list.txt', 'w') as f:
json.dump(temp_pics_source_list, f)
# creates the file if the script is run for the 1st time and the file is not present
except (ValueError, FileNotFoundError):
with open('temp_pics_source_list.txt', 'w') as f:
json.dump(temp_pics_source_list, f)
@ -392,11 +396,27 @@ class CurateData:
return expanded_class, expanded_dropd
def dl_pictures(self):
'''
Downloads pictures from api to local storage using temp_pics_source_list
and creates custom {source:target} dictionary as dict_pics
'''
def dl_pic(self, dict_pics, pic):
    '''
    Downloads one picture from its source URL to its local target path.

    dict_pics is the {source:target} dictionary and pic is a source URL.
    Returns the target path if the file already exists or was downloaded,
    and None if pic has no entry in dict_pics or the connection failed,
    so a caller can treat None as "this picture was not saved".
    '''
    try:
        target = dict_pics[pic]
    except KeyError:
        # no target path is known for this URL, so there is nothing to save
        return None

    if os.path.exists(target):
        # already on disk; do not download it again
        return target

    try:
        r = requests.get(pic, stream=True)
        try:
            r.raw.decode_content = True
            with open(target, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        finally:
            # release the streamed connection even if the copy fails
            r.close()
    except (ConnectionError, requests.exceptions.ConnectionError):
        # requests raises its own ConnectionError, which does not inherit
        # from the builtin one, so both have to be named here
        if os.path.exists(target):
            # drop a partially written file so a later run retries it
            os.remove(target)
        return None
    return target
def dict_pics(self):
try:
with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
@ -418,44 +438,45 @@ class CurateData:
try:
temp_pics_source_list = json.load(f)
except (ValueError, FileNotFoundError):
print('url list not found. download aborted')
print('url list not found. aborting')
return
dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
dict_pics = {}
for k in temp_pics_source_list:
if re.search(r'[^/]+(?=/\$_|.(.jpg|.jpeg|.png))', k, re.IGNORECASE) and re.search(r'(.jpg|.jpeg|.png)', k, re.IGNORECASE) is not None:
tag = re.search(r'[^/]+(?=/\$_|.(.jpg|.jpeg|.png))', k, re.IGNORECASE).group() + re.search(r'(.jpg|.jpeg|.png)', k, re.IGNORECASE).group()
file_name = target_dir + os.sep + tag
dict_pics.update({k:file_name})
with open('dict_pics.txt', 'w') as f:
json.dump(dict_pics, f)
def dl_pic(dict_pics, pic):
return dict_pics # TODO still need to find sol to outliers (i.e., naming scheme for unusual source URLs)
def dl_pictures(self, *dict_pics):
'''
Downloads pictures from api to local storage using temp_pics_source_list
and creates custom {source:target} dictionary as dict_pics
'''
if not dict_pics:
with open('dict_pics.txt') as f:
dict_pics = json.load(f)
with open('temp_pics_source_list.txt') as f:
try:
if os.path.exists(dict_pics[pic]):
pass # TODO should catch dupes, but make sure it is
else:
try:
r = requests.get(pic, stream=True)
r.raw.decode_content = True
with open(dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work
shutil.copyfileobj(r.raw, f)
except ConnectionError:
return
except KeyError:
pass
temp_pics_source_list = json.load(f)
except (ValueError, FileNotFoundError):
print('url list not found. download aborted')
return
bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
with concurrent.futures.ThreadPoolExecutor() as executor:
for future in executor.map(lambda p: dl_pic(*p), bargs):
for future in executor.map(lambda p: self.dl_pic(*p), bargs):
if future is not None:
future
else:
print('connection error')
os.remove('temp_pics_source_list.txt') # Deletes file after downloads complete successfully
class PreProcessing:
'''
Includes methods for pre-processing training set input and labels in the