From 2f3df22c4abfbfe18ffa7a66af44eca538366c03 Mon Sep 17 00:00:00 2001 From: scott Date: Sun, 12 Dec 2021 19:22:09 -0700 Subject: [PATCH] added image_faults.py 2 remove faulty images, fixed non-expand PictureURL dfs --- curate.py | 2 +- ebay_api.py | 11 ++++++----- image_faults.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 image_faults.py diff --git a/curate.py b/curate.py index 5be0685..45cbd04 100644 --- a/curate.py +++ b/curate.py @@ -11,7 +11,7 @@ training = curate.to_training(raw_data) # creates raw_df class_training = curate.class_training(training) # creates initial class_training df nvl_training = curate.nvl_training(training) # creates initial nvl_training dropd = curate.drop_nvl_cols(nvl_training) # label mask - +dropd expanded_dfs = curate.expand_nvlclass(class_training, dropd) # pulls values out of lists for both dfs expanded_class = expanded_dfs[0] # TODO still having problems with Unnamed: 0 col diff --git a/ebay_api.py b/ebay_api.py index 4cef34a..db5954c 100644 --- a/ebay_api.py +++ b/ebay_api.py @@ -58,7 +58,6 @@ class ShoppingApi: try: response = requests.get(url, headers=headers, timeout=4) response.raise_for_status() - return response except requests.exceptions.RequestException: print('connection error') @@ -69,7 +68,7 @@ class ShoppingApi: temp_cat_list = [cat['CategoryID'] for cat in response] cat_list.extend(temp_cat_list) - with open('cat_list1.txt', 'w') as f: + with open('cat_list.txt', 'w') as f: json.dump(cat_list, f) # leaf_list = [node['LeafCategory'] for node in response] @@ -281,9 +280,11 @@ class CurateData: # computate power reqs. 
So, figure out a way to make a true temp list based on the current call executed else: - class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0]) - expanded_class = class_training - dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0]) + class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan) + expanded_class = class_training.dropna() + dropd = dropd.dropna(subset=['PictureURL']) + dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan) + dropd = dropd.dropna(subset=['PictureURL']) expanded_dropd = dropd expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values diff --git a/image_faults.py b/image_faults.py new file mode 100644 index 0000000..e90294c --- /dev/null +++ b/image_faults.py @@ -0,0 +1,28 @@ +import os +import PIL +from pathlib import Path +from PIL import UnidentifiedImageError, Image + +''' +Since PIL is used in keras to open images, you need to identify and remove +faulty images to avoid hiccups in training. When these are removed from their +parent folders, their corresponding row in the dataframe should also be removed. +But because the dataframe is constructed as such: + +''' +def faulty_images(): + path = Path("training_images").rglob("*.jpg") + for img_p in path: + try: + PIL.Image.open(img_p).close() + except PIL.UnidentifiedImageError: + os.remove(img_p) + print(f"{img_p} Removed") +# remove from folder, dataset(is constructed from the csv files +# ), dict_pics, temp_pics_source_list, +# expanded_dropd, expanded_class. But, remember that if you run curate.py +# again the same faulty images will be recreated since it's still in +# the raw_data.txt file + +if __name__=="__main__": + faulty_images()