In [1]:
from matplotlib import pyplot as plt
from matplotlib.image import imread
import pandas as pd
from collections import Counter
import json
import os
import re
import tempfile
import numpy as np
from os.path import exists
from imblearn.under_sampling import RandomUnderSampler
from PIL import ImageFile
import sklearn as sk
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
# custom modules
import image_faults

# Tell PIL to load image files that are truncated instead of raising an error,
# so a partially downloaded picture does not abort a training batch.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [2]:
def add_regularization(model, regularizer=tf.keras.regularizers.l2(0.0001)):
    """Return a copy of ``model`` whose kernels use ``regularizer``.

    Every layer exposing a ``kernel_regularizer`` attribute gets ``regularizer``
    assigned to it. Setting the attribute only changes the layer's config, so
    the model is rebuilt from its JSON config and the original weights are
    restored into the rebuilt model (matched by layer name).

    Parameters
    ----------
    model : a Keras model exposing ``layers``, ``to_json``, ``save_weights``.
    regularizer : tf.keras.regularizers.Regularizer
        Regularizer to attach; defaults to L2 with factor 1e-4.

    Returns
    -------
    The rebuilt model. It is uncompiled, so the caller must compile it.
    If ``regularizer`` is not a ``Regularizer`` instance, a message is printed
    and the input ``model`` is returned unchanged.
    """
    if not isinstance(regularizer, tf.keras.regularizers.Regularizer):
        print("Regularizer must be a subclass of tf.keras.regularizers.Regularizer")
        return model

    for layer in model.layers:
        if hasattr(layer, 'kernel_regularizer'):
            layer.kernel_regularizer = regularizer

    # When we change the layers' attributes, the change only happens in the
    # model config, so serialise the config and rebuild from it.
    model_json = model.to_json()

    # Round-trip the weights through a private temporary directory: a unique
    # path avoids clashes with other processes using the shared temp dir, and
    # the context manager removes the file once the weights are reloaded.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_weights_path = os.path.join(tmp_dir, 'tmp_weights.h5')
        model.save_weights(tmp_weights_path)

        # Load the model from the config, then restore the saved weights.
        model = tf.keras.models.model_from_json(model_json)
        model.load_weights(tmp_weights_path, by_name=True)

    return model

In [3]:
# Optional cleanup step from the local image_faults module (currently disabled):
# image_faults.faulty_images() # removes faulty images
# Load the listings table; the first CSV column becomes the index.
# low_memory=False makes pandas read the file in one pass instead of chunks.
df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)

In [4]:
def dict_pics():
    """Build a ``{source_url: local_image_path}`` dictionary.

    Reads the JSON document in ``temp_pics_source_list.txt`` (iterated as a
    collection of source URL strings) and maps each URL to
    ``<cwd>/training_images/<name>.jpg``. ``<name>`` is the first run of
    non-slash characters in the URL that is followed by ``/$_`` or by a
    literal ``.jpg`` (case-insensitive).

    Returns
    -------
    dict
        Source URL -> target file path.

    Raises
    ------
    AttributeError
        If a URL contains no segment matching the pattern.
    """
    target_dir = os.path.join(os.getcwd(), "training_images")
    # The dot before "jpg" is escaped so it only matches a real extension;
    # unescaped it matched any character followed by "jpg" (e.g. "xyzjpg").
    name_pattern = re.compile(r'[^/]+(?=/\$_|\.jpg)', re.IGNORECASE)

    with open('temp_pics_source_list.txt') as f:
        source_urls = json.load(f)

    source_to_target = {
        url: os.path.join(target_dir, name_pattern.search(url).group() + '.jpg')
        for url in source_urls
    }
    print("{source:target} dictionary created @ " + target_dir)
    return source_to_target

# Build the URL -> local path lookup (this rebinds the name `dict_pics`
# from the function to the dictionary it returns).
dict_pics = dict_pics()

# Replace every source URL in PictureURL with its local file path and put
# that column first, followed by all the remaining columns.
blah = df['PictureURL'].apply(lambda url: dict_pics[url])
df = pd.concat([blah, df.drop(columns='PictureURL')], axis=1)

# Keep only categories that have more than 25 rows (removes category outliers).
df = df.groupby('PrimaryCategoryID').filter(lambda group: len(group) > 25)
# NOTE: rows whose image path does not exist on disk are not removed here.

{source:target} dictionary created @ /tf/training_images


In [5]:
# The category ids are used as class labels further down, so store them as
# strings rather than the integers pandas inferred from the CSV.
df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints

# Shuffle all rows. The seed is pinned (42, as used by the split and the
# generators) so the row order is reproducible across runs.
df = df.sample(frac=1, random_state=42)

In [6]:
# Balance the classes by randomly dropping rows from the larger categories.
# random_state is pinned (42, as elsewhere in this notebook) so the same rows
# are kept on every run.
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
# The whole frame is passed as the feature matrix so every column survives;
# y_under is the resampled label vector.
train, y_under = undersample.fit_resample(df, df['PrimaryCategoryID'])
# print(Counter(train['PrimaryCategoryID']))

In [7]:
# Hold out 10% of the undersampled rows as `test`; the other 90% stay in `train`.
# NOTE: this rebinds `train`, so re-running only this cell splits the
# already-split frame again.
train, test = train_test_split(train, test_size=0.1, random_state=42)
# stratify=train['PrimaryCategoryID']
# train['PrimaryCategoryID'].value_counts()

In [8]:
# Input pipeline for fine-tuning VGG16.
# vgg16.preprocess_input already produces the format the ImageNet weights were
# trained on. ImageDataGenerator applies `rescale` after
# `preprocessing_function`, so also passing rescale=1/255 would shrink the
# preprocessed pixels by ~255x. No `rescale` is set for that reason.
datagen = ImageDataGenerator(
    validation_split=.2,
    #samplewise_std_normalization=True,
    #horizontal_flip= True,
    #vertical_flip= True,
    #width_shift_range= 0.2,
    #height_shift_range= 0.2,
    #rotation_range= 90,
    preprocessing_function=tf.keras.applications.vgg16.preprocess_input)

# Arguments shared by the training and validation iterators. Both read the
# same `train` frame; `subset` plus `validation_split` carve it into two
# disjoint parts, so using `train` for validation as well is intended.
flow_kwargs = dict(
    dataframe=train,
    directory='./training_images',
    x_col='PictureURL',
    y_col='PrimaryCategoryID',
    batch_size=32,
    seed=42,
    shuffle=True,
    target_size=(224, 224),
)

train_generator = datagen.flow_from_dataframe(subset='training', **flow_kwargs)
validation_generator = datagen.flow_from_dataframe(subset='validation', **flow_kwargs)

Found 5110 validated image filenames belonging to 13 classes.
Found 1277 validated image filenames belonging to 13 classes.




In [9]:
# Pull one batch (images and their labels) from the training generator,
# e.g. for visual inspection with plotImages.
imgs, labels = next(train_generator)

In [10]:
def plotImages(images_arr):
    """Display the images in ``images_arr`` in one row of ten panels.

    Images are paired with panels in order; pairing stops at whichever runs
    out first. Axes are hidden on every panel that receives an image.
    """
    _, panel_grid = plt.subplots(1, 10, figsize=(20,20))
    panels = panel_grid.flatten()
    for picture, panel in zip(images_arr, panels):
        panel.imshow(picture)
        panel.axis('off')
    plt.tight_layout()
    plt.show()

In [11]:
#plotImages(imgs)
#print(labels)

In [12]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
physical_devices = tf.config.list_physical_devices('GPU')
print(len(physical_devices))
# Loop over the devices so a CPU-only machine does not raise IndexError and
# every GPU (not just the first) gets the same setting.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

1


In [13]:
# See https://www.kaggle.com/dmitrypukhov/cnn-with-imagedatagenerator-flow-from-dataframe
# for an example of a train/test/validation split.

# TODO: may need to either create a test dataset from the original dataset or download a new one.

In [14]:
# Load VGG16 with ImageNet weights.
# NOTE: despite the variable name, this is VGG16 (not VGG19).
vgg19_model = tf.keras.applications.vgg16.VGG16(weights='imagenet')


In [15]:
# Copy every VGG16 layer except the last one into a fresh Sequential model,
# so that a new output layer can be attached afterwards.
model = Sequential(vgg19_model.layers[:-1])

In [21]:
# Mark every layer currently in the model as trainable, so the whole network
# (not just a newly added head) is updated during fitting.
for existing_layer in model.layers:
    existing_layer.trainable = True

In [22]:
#model.add(Dropout(.5))
#model.add(Dense(64, activation='softmax'))
# model.add(Dropout(.25))
# Size the softmax head from the classes the training generator found, so it
# stays correct if the category filter changes the number of classes
# (13 for the current data).
num_classes = len(train_generator.class_indices)
model.add(Dense(units=num_classes, activation='softmax'))

In [23]:
# Attach the default L2 kernel regularizer to every layer that supports one.
# add_regularization rebuilds the model from its config, so `model` must be
# rebound to the returned object.
model = add_regularization(model)
#model.summary()


In [24]:
# Compile with Adam at a small learning rate (1e-5) and track accuracy.
# categorical_crossentropy pairs with the softmax output layer; presumably the
# generators yield one-hot labels (their default class_mode) — confirm before
# switching to the sparse variant noted below.
model.compile(optimizer=Adam(learning_rate=1e-5), loss='categorical_crossentropy',
              metrics=['accuracy'])
# sparse_categorical_crossentropy

In [25]:
# Train for 30 epochs. len(generator) is the number of batches it yields per
# pass, so each epoch covers the full training and validation subsets once.
model.fit(x=train_generator,
          steps_per_epoch=len(train_generator),
          validation_data=validation_generator,
          validation_steps=len(validation_generator),
          epochs=30,
          verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


