From 8c41fe1daff33d6b503aa3435b5c7e87b2fc7659 Mon Sep 17 00:00:00 2001 From: scott Date: Fri, 7 Jan 2022 18:28:37 -0700 Subject: [PATCH] new dl_pic, dict_pics methods and updated dl_pictures. fixed bugs --- Shoe Classifier_VGG19.ipynb | 4 +- Shoe Classifier_Xception.ipynb | 26 +++++++----- curate.py | 2 +- ebay_api.py | 77 +++++++++++++++++++++------------- 4 files changed, 68 insertions(+), 41 deletions(-) diff --git a/Shoe Classifier_VGG19.ipynb b/Shoe Classifier_VGG19.ipynb index baac84d..eaee681 100644 --- a/Shoe Classifier_VGG19.ipynb +++ b/Shoe Classifier_VGG19.ipynb @@ -93,7 +93,7 @@ } ], "source": [ - "def dict_pics():\n", + "def dict_pics_jup():\n", " target_dir = os.getcwd() + os.sep + \"training_images\"\n", " with open('temp_pics_source_list.txt') as f:\n", " temp_pics_source_list = json.load(f)\n", @@ -101,7 +101,7 @@ " print(\"{source:target} dictionary created @ \" + target_dir)\n", " return dict_pics\n", "\n", - "dict_pics = dict_pics()\n", + "dict_pics = dict_pics_jup()\n", "blah = pd.Series(df.PictureURL)\n", "df = df.drop(labels=['PictureURL'], axis=1)\n", "blah = blah.apply(lambda x: dict_pics[x])\n", diff --git a/Shoe Classifier_Xception.ipynb b/Shoe Classifier_Xception.ipynb index e14bfbe..ae4eb7f 100644 --- a/Shoe Classifier_Xception.ipynb +++ b/Shoe Classifier_Xception.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "id": "572dc7fb", "metadata": {}, "outputs": [], @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "id": "8d94196d", "metadata": {}, "outputs": [], @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "id": "a5c72863", "metadata": {}, "outputs": [], @@ -79,22 +79,28 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "id": "1057a442", "metadata": { "scrolled": true }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{source:target} dictionary created @ /tf/training_images\n" + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'group'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_pics_jup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mblah\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPictureURL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PictureURL'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mdict_pics_jup\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'group'" ] } ], "source": [ - "def dict_pics():\n", + "def dict_pics_jup(): # \n", " target_dir = os.getcwd() + os.sep + \"training_images\"\n", " with open('temp_pics_source_list.txt') as f:\n", " temp_pics_source_list = json.load(f)\n", @@ -102,7 +108,7 @@ " print(\"{source:target} dictionary created @ \" + target_dir)\n", " return dict_pics\n", "\n", - "dict_pics = dict_pics()\n", + "dict_pics = dict_pics_jup()\n", "blah = pd.Series(df.PictureURL)\n", "df = df.drop(labels=['PictureURL'], axis=1)\n", "blah = blah.apply(lambda x: dict_pics[x])\n", diff --git a/curate.py b/curate.py index e505cff..5376e64 100644 --- a/curate.py +++ b/curate.py @@ -23,6 +23,6 @@ download = input('download images?: ') if ('y' or 'Y') in download: with open('temp_pics_source_list.txt') as f: url_list = json.load(f) - curate.dl_pictures(url_list) + curate.dl_pictures() else: pass diff --git a/ebay_api.py b/ebay_api.py index da47a63..f33c6cc 100644 --- a/ebay_api.py +++ b/ebay_api.py @@ -379,9 +379,13 @@ class CurateData: with open('temp_pics_source_list.txt') as f: tpsl = json.load(f) tpsl.extend(temp_pics_source_list) + + # ensures no duplicate source URLs exist temp_pics_source_list = list(set(tpsl)) with open('temp_pics_source_list.txt', 'w') as f: json.dump(temp_pics_source_list, f) + + # creates file if script is ran for 1st time and file not present except (ValueError, FileNotFoundError): with open('temp_pics_source_list.txt', 'w') as f: json.dump(temp_pics_source_list, f) @@ -392,11 +396,27 @@ class CurateData: return expanded_class, expanded_dropd - def dl_pictures(self): - ''' - Downloads pictures from api to local storage using temp_pics_source_list - and creates custom {source:target} dictionary as dict_pics - ''' + def dl_pic(self,dict_pics, pic): + + try: + + if os.path.exists(dict_pics[pic]): + pass + + else: + try: + r = requests.get(pic, stream=True) + r.raw.decode_content = True + with open(dict_pics[pic], 'wb') as f: + shutil.copyfileobj(r.raw, f) + except ConnectionError: + + return + + except KeyError: + pass + + def dict_pics(self): try: with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments @@ -418,44 +438,45 @@ class CurateData: try: temp_pics_source_list = json.load(f) except (ValueError, FileNotFoundError): - print('url list not found. download aborted') + print('url list not found. aborting') return - - dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list} + dict_pics = {} + for k in temp_pics_source_list: + if re.search(r'[^/]+(?=/\$_|.(.jpg|.jpeg|.png))', k, re.IGNORECASE) and re.search(r'(.jpg|.jpeg|.png)', k, re.IGNORECASE) is not None: + tag = re.search(r'[^/]+(?=/\$_|.(.jpg|.jpeg|.png))', k, re.IGNORECASE).group() + re.search(r'(.jpg|.jpeg|.png)', k, re.IGNORECASE).group() + file_name = target_dir + os.sep + tag + dict_pics.update({k:file_name}) + with open('dict_pics.txt', 'w') as f: json.dump(dict_pics, f) - def dl_pic(dict_pics, pic): + return dict_pics # TODO still need to find sol to outliers (i.e., naming scheme for unusual source URLs) + def dl_pictures(self, *dict_pics): + ''' + Downloads pictures from api to local storage using temp_pics_source_list + and creates custom {source:target} dictionary as dict_pics + ''' + + if not dict_pics: + with open('dict_pics.txt') as f: + dict_pics = json.load(f) + with open('temp_pics_source_list.txt') as f: try: - - if os.path.exists(dict_pics[pic]): - pass # TODO should catch dupes, but make sure it is - - else: - try: - r = requests.get(pic, stream=True) - r.raw.decode_content = True - with open(dict_pics[pic], 'wb') as f: # Or call dict_pics[pic] can work - shutil.copyfileobj(r.raw, f) - except ConnectionError: - - return - - except KeyError: - pass + temp_pics_source_list = json.load(f) + except (ValueError, FileNotFoundError): + print('url list not found. download aborted') + return bargs = [(dict_pics, pic) for pic in temp_pics_source_list] with concurrent.futures.ThreadPoolExecutor() as executor: - for future in executor.map(lambda p: dl_pic(*p), bargs): + for future in executor.map(lambda p: self.dl_pic(*p), bargs): if future is not None: future else: print('connection error') - os.remove('temp_pics_source_list.txt') # Deletes file after downloads complete successfully - class PreProcessing: ''' Includes methods for pre-processing training set input and labels in the