From 2f3df22c4abfbfe18ffa7a66af44eca538366c03 Mon Sep 17 00:00:00 2001 From: scott Date: Sun, 12 Dec 2021 19:22:09 -0700 Subject: [PATCH] added image_faults.py 2 remove faulty images, fixed non-expand PictureURL dfs --- curate.py | 2 +- ebay_api.py | 11 ++++++----- image_faults.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 image_faults.py diff --git a/curate.py b/curate.py index 5be0685..45cbd04 100644 --- a/curate.py +++ b/curate.py @@ -11,7 +11,7 @@ training = curate.to_training(raw_data) # creates raw_df class_training = curate.class_training(training) # creates initial class_training df nvl_training = curate.nvl_training(training) # creates initial nvl_training dropd = curate.drop_nvl_cols(nvl_training) # label mask - +dropd expanded_dfs = curate.expand_nvlclass(class_training, dropd) # pulls values out of lists for both dfs expanded_class = expanded_dfs[0] # TODO still having problems with Unnamed: 0 col diff --git a/ebay_api.py b/ebay_api.py index 4cef34a..db5954c 100644 --- a/ebay_api.py +++ b/ebay_api.py @@ -58,7 +58,6 @@ class ShoppingApi: try: response = requests.get(url, headers=headers, timeout=4) response.raise_for_status() - return response except requests.exceptions.RequestException: print('connection error') @@ -69,7 +68,7 @@ class ShoppingApi: temp_cat_list = [cat['CategoryID'] for cat in response] cat_list.extend(temp_cat_list) - with open('cat_list1.txt', 'w') as f: + with open('cat_list.txt', 'w') as f: json.dump(cat_list, f) # leaf_list = [node['LeafCategory'] for node in response] @@ -281,9 +280,11 @@ class CurateData: # computate power reqs. 
So, figure out a way to make a true temp list based on the current call executed else: - class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0]) - expanded_class = class_training - dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0]) + class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan) + expanded_class = class_training.dropna() + dropd = dropd.dropna(subset=['PictureURL']) + dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan) + dropd = dropd.dropna(subset=['PictureURL']) expanded_dropd = dropd expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values diff --git a/image_faults.py b/image_faults.py new file mode 100644 index 0000000..e90294c --- /dev/null +++ b/image_faults.py @@ -0,0 +1,28 @@ +import os +import PIL +from pathlib import Path +from PIL import UnidentifiedImageError, Image + +''' +Since PIL is used in keras to open images, you need to identify and remove +faulty images to avoid hiccups in training. When these are removed from their +parent folders, their corresponding row in the dataframe should also be removed. +But because the dataframe is constructed as such: + +''' +def faulty_images(): + path = Path("training_images").rglob("*.jpg") + for img_p in path: + try: + PIL.Image.open(img_p).close() + except PIL.UnidentifiedImageError: + os.remove(img_p) + print(f"{img_p} Removed") +# remove from folder, dataset(is constructed from the csv files +# ), dict_pics, temp_pics_source_list, +# expanded_dropd, expanded_class. But, remember that if you run curate.py +# again the same faulty images will be recreated since it's still in +# the raw_data.txt file + +if __name__=="__main__": + faulty_images()