Add dl_pic and dict_pics methods, update dl_pictures, and fix bugs

2022-01-07 18:28:37 -07:00
parent ae25ab88b6
commit 8c41fe1daf
4 changed files with 68 additions and 41 deletions
--- a/Classifier_VGG19.ipynb
+++ b/Classifier_VGG19.ipynb
@@ -93,7 +93,7 @@
    }
   ],
   "source": [
-    "def dict_pics():\n",
+    "def dict_pics_jup():\n",
    "    target_dir = os.getcwd() + os.sep + \"training_images\"\n",
    "    with open('temp_pics_source_list.txt') as f:\n",
    "        temp_pics_source_list = json.load(f)\n",
@@ -101,7 +101,7 @@
    "    print(\"{source:target} dictionary created @ \" + target_dir)\n",
    "    return dict_pics\n",
    "\n",
-    "dict_pics = dict_pics()\n",
+    "dict_pics = dict_pics_jup()\n",
    "blah = pd.Series(df.PictureURL)\n",
    "df = df.drop(labels=['PictureURL'], axis=1)\n",
    "blah = blah.apply(lambda x: dict_pics[x])\n",
--- a/Classifier_Xception.ipynb
+++ b/Classifier_Xception.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 7,
   "id": "572dc7fb",
   "metadata": {},
   "outputs": [],
@@ -35,7 +35,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
   "id": "8d94196d",
   "metadata": {},
   "outputs": [],
@@ -68,7 +68,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
   "id": "a5c72863",
   "metadata": {},
   "outputs": [],
@@ -79,22 +79,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 11,
   "id": "1057a442",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{source:target} dictionary created @ /tf/training_images\n"
+     "ename": "AttributeError",
+     "evalue": "'NoneType' object has no attribute 'group'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_pics_jup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     10\u001b[0m \u001b[0mblah\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPictureURL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PictureURL'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36mdict_pics_jup\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m         \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0;32mreturn\u001b[0m 
\u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m<ipython-input-11-d8afc400d306>\u001b[0m in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'temp_pics_source_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m         \u001b[0mtemp_pics_source_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mtarget_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.jpg)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.jpg'\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{source:target} dictionary created @ \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtarget_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0;32mreturn\u001b[0m 
\u001b[0mdict_pics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'group'"
     ]
    }
   ],
   "source": [
-    "def dict_pics():\n",
+    "def dict_pics_jup(): # \n",
    "    target_dir = os.getcwd() + os.sep + \"training_images\"\n",
    "    with open('temp_pics_source_list.txt') as f:\n",
    "        temp_pics_source_list = json.load(f)\n",
@@ -102,7 +108,7 @@
    "    print(\"{source:target} dictionary created @ \" + target_dir)\n",
    "    return dict_pics\n",
    "\n",
-    "dict_pics = dict_pics()\n",
+    "dict_pics = dict_pics_jup()\n",
    "blah = pd.Series(df.PictureURL)\n",
    "df = df.drop(labels=['PictureURL'], axis=1)\n",
    "blah = blah.apply(lambda x: dict_pics[x])\n",
--- a/curate.py
+++ b/curate.py
@@ -23,6 +23,6 @@ download = input('download images?: ')
 if ('y' or 'Y') in download:
    with open('temp_pics_source_list.txt') as f:
        url_list = json.load(f)
-    curate.dl_pictures(url_list)
+    curate.dl_pictures()
 else:
    pass
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -379,9 +379,13 @@ class CurateData:
            with open('temp_pics_source_list.txt') as f:
                tpsl = json.load(f)
                tpsl.extend(temp_pics_source_list)
+
+                # ensures no duplicate source URLs exist
                temp_pics_source_list = list(set(tpsl))
                with open('temp_pics_source_list.txt', 'w') as f:
                    json.dump(temp_pics_source_list, f)
+
+        # creates the file if the script is run for the first time and the file is not present
        except (ValueError, FileNotFoundError):
            with open('temp_pics_source_list.txt', 'w') as f:
                json.dump(temp_pics_source_list, f)
@@ -392,11 +396,27 @@ class CurateData:

        return expanded_class, expanded_dropd

-    def dl_pictures(self):
-        '''
-        Downloads pictures from api to local storage using temp_pics_source_list
-        and creates custom {source:target} dictionary as dict_pics
-        '''
+    def dl_pic(self,dict_pics, pic):
+
+        try:
+
+            if os.path.exists(dict_pics[pic]):
+                pass
+
+            else:
+                try:
+                    r = requests.get(pic, stream=True)
+                    r.raw.decode_content = True
+                    with open(dict_pics[pic], 'wb') as  f:
+                        shutil.copyfileobj(r.raw, f)
+                except ConnectionError:
+
+                    return
+
+        except KeyError:
+            pass
+
+    def dict_pics(self):

        try:
            with open('target_dirs.txt', 'r+') as f: # TODO you can add option to change directory here, too. Look up how to have optional arguments
@@ -418,44 +438,45 @@ class CurateData:
            try:
                temp_pics_source_list = json.load(f)
            except (ValueError, FileNotFoundError):
-                print('url list not found. download aborted')
+                print('url list not found. aborting')
                return

-        
-        dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
+        dict_pics = {}
+        for k in temp_pics_source_list:
+            if re.search(r'[^/]+(?=/\$_|.(.jpg|.jpeg|.png))', k, re.IGNORECASE) and re.search(r'(.jpg|.jpeg|.png)', k, re.IGNORECASE) is not None:
+                tag = re.search(r'[^/]+(?=/\$_|.(.jpg|.jpeg|.png))', k, re.IGNORECASE).group() + re.search(r'(.jpg|.jpeg|.png)', k, re.IGNORECASE).group()
+                file_name = target_dir + os.sep + tag
+                dict_pics.update({k:file_name})
+
        with open('dict_pics.txt', 'w') as f:
            json.dump(dict_pics, f)

-        def dl_pic(dict_pics, pic):
+        return dict_pics # TODO still need to find a solution for outliers (i.e., a naming scheme for unusual source URLs)

+    def dl_pictures(self, *dict_pics):
+        '''
+        Downloads pictures from api to local storage using temp_pics_source_list
+        and the {source:target} dictionary dict_pics (loaded from dict_pics.txt if not given)
+        '''
+
+        if not dict_pics:
+            with open('dict_pics.txt') as f:
+                dict_pics = json.load(f)
+        with open('temp_pics_source_list.txt') as f:
            try:
-
-                if os.path.exists(dict_pics[pic]):
-                    pass # TODO should catch dupes, but make sure it is
-
-                else:
-                    try:
-                        r = requests.get(pic, stream=True)
-                        r.raw.decode_content = True
-                        with open(dict_pics[pic], 'wb') as  f: # Or call dict_pics[pic] can work
-                            shutil.copyfileobj(r.raw, f)
-                    except ConnectionError:
-
-                        return
-
-            except KeyError:
-                pass
+                temp_pics_source_list = json.load(f)
+            except (ValueError, FileNotFoundError):
+                print('url list not found. download aborted')
+                return

        bargs = [(dict_pics, pic) for pic in temp_pics_source_list]
        with concurrent.futures.ThreadPoolExecutor() as executor:
-            for future in executor.map(lambda p: dl_pic(*p), bargs):
+            for future in executor.map(lambda p: self.dl_pic(*p), bargs):
                if future is not None:
                    future
                else:
                    print('connection error')

-        os.remove('temp_pics_source_list.txt') # Deletes file after downloads complete successfully
-
 class PreProcessing:
    '''
    Includes methods for pre-processing training set input and labels in the