Added image_faults.py to remove faulty images; fixed non-expanded PictureURL dfs

This commit is contained in:
scott 2021-12-12 19:22:09 -07:00
parent e766a1ba9d
commit 2f3df22c4a
3 changed files with 35 additions and 6 deletions

View File

@ -11,7 +11,7 @@ training = curate.to_training(raw_data) # creates raw_df
class_training = curate.class_training(training) # creates initial class_training df
nvl_training = curate.nvl_training(training) # creates initial nvl_training
dropd = curate.drop_nvl_cols(nvl_training) # label mask
dropd
expanded_dfs = curate.expand_nvlclass(class_training, dropd) # pulls values out of lists for both dfs
expanded_class = expanded_dfs[0] # TODO still having problems with Unnamed: 0 col

View File

@ -58,7 +58,6 @@ class ShoppingApi:
try:
response = requests.get(url, headers=headers, timeout=4)
response.raise_for_status()
return response
except requests.exceptions.RequestException:
print('connection error')
@ -69,7 +68,7 @@ class ShoppingApi:
temp_cat_list = [cat['CategoryID'] for cat in response]
cat_list.extend(temp_cat_list)
with open('cat_list1.txt', 'w') as f:
with open('cat_list.txt', 'w') as f:
json.dump(cat_list, f)
# leaf_list = [node['LeafCategory'] for node in response]
@ -281,9 +280,11 @@ class CurateData:
# computate power reqs. So, figure out a way to make a true temp list based on the current call executed
else:
class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0])
expanded_class = class_training
dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0])
class_training['PictureURL'] = class_training['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan)
expanded_class = class_training.dropna()
dropd = dropd.dropna(subset=['PictureURL'])
dropd['PictureURL'] = dropd['PictureURL'].apply(lambda x: x[0] if len(x)>0 else np.nan)
dropd = dropd.dropna(subset=['PictureURL'])
expanded_dropd = dropd
expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values

28
image_faults.py Normal file
View File

@ -0,0 +1,28 @@
import os
import PIL
from pathlib import Path
from PIL import UnidentifiedImageError, Image
'''
Since PIL is used in keras to open images, you need to identify and remove
faulty images to avoid hiccups in training. When these are removed from their
parent folders, their corresponding rows in the dataframe should also be removed.
This script only deletes the image files; because of how the dataframe is
constructed, the matching rows still have to be removed separately (see the
notes in faulty_images).
'''
def faulty_images(root="training_images"):
    """Delete every ``*.jpg`` under *root* that PIL cannot identify as an image.

    Each candidate file is opened with ``PIL.Image.open``; when that raises
    ``PIL.UnidentifiedImageError`` the file is removed from disk and a message
    is printed.

    Args:
        root: Directory that is searched recursively for ``*.jpg`` files.
            Defaults to ``"training_images"``.

    Returns:
        A list with the ``Path`` of every file that was removed, in the order
        they were deleted. Empty when nothing was faulty.

    Note:
        Only the files are removed here. The other places that still refer to
        them must be cleaned up separately: the dataset (which is constructed
        from the csv files), dict_pics, temp_pics_source_list, expanded_dropd
        and expanded_class. Also remember that if you run curate.py again the
        same faulty images will be recreated, since they are still in the
        raw_data.txt file.
    """
    removed = []
    # Materialize the listing first so that deleting files does not disturb
    # the directory walk that is producing them.
    for img_p in list(Path(root).rglob("*.jpg")):
        try:
            # Opening is enough to trigger UnidentifiedImageError; the context
            # manager closes the file handle of images that are fine.
            with PIL.Image.open(img_p):
                pass
        except PIL.UnidentifiedImageError:
            os.remove(img_p)
            # img_p is a Path, so it cannot be concatenated to a str with "+".
            print(f"{img_p} Removed")
            removed.append(img_p)
    return removed
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    faulty_images()