In [1]:
from matplotlib import pyplot as plt
from matplotlib.image import imread
import pandas as pd
import json
import os
import re
import numpy as np
from os.path import exists
from PIL import ImageFile
#import sklearn as sk
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
# custom modules
import image_faults

ImageFile.LOAD_TRUNCATED_IMAGES = True

In [2]:
# image_faults.faulty_images() # removes faulty images
df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)

In [3]:
def dict_pics():
    target_dir = os.getcwd() + os.sep + "training_images"
    with open('temp_pics_source_list.txt') as f:
        temp_pics_source_list = json.load(f)
    dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}
    print("{source:target} dictionary created @ " + target_dir)
    return dict_pics

dict_pics = dict_pics()
blah = pd.Series(df.PictureURL)
df = df.drop(labels=['PictureURL'], axis=1)
blah = blah.apply(lambda x: dict_pics[x])
df = pd.concat([blah, df],axis=1)
df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers
# removes non-existent image paths

{source:target} dictionary created @ /tf/training_images


In [4]:
df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints

df=df.sample(frac=1)

In [5]:
datagen = ImageDataGenerator(rescale=1./255., 
                             validation_split=.3,
                             #featurewise_std_normalization=True,
                             #horizontal_flip= True,
                             #vertical_flip= True,
                             #width_shift_range= 0.2,
                             #height_shift_range= 0.2,
                             #rotation_range= 180,
                             preprocessing_function=tf.keras.applications.vgg16.preprocess_input)
train_generator=datagen.flow_from_dataframe(
    dataframe=df[:len(df)],
    directory='./training_images',
    x_col='PictureURL',
    y_col='PrimaryCategoryID',
    batch_size=32,
    seed=42,
    shuffle=True,
    target_size=(224,224),
    subset='training'
    )
validation_generator=datagen.flow_from_dataframe(
    dataframe=df[:len(df)],
    directory='./training_images',
    x_col='PictureURL',
    y_col='PrimaryCategoryID',
    batch_size=32,
    seed=42,
    shuffle=True,
    target_size=(224,224),
    subset='validation'
    )

Found 6217 validated image filenames belonging to 13 classes.
Found 2664 validated image filenames belonging to 13 classes.




In [6]:
imgs, labels = next(train_generator)

In [7]:
def plotImages(images_arr):
    fig, axes = plt.subplots(1, 10, figsize=(20,20))
    axes = axes.flatten()
    for img, ax in zip( images_arr, axes):
        ax.imshow(img)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

In [8]:
#plotImages(imgs)
#print(labels)

In [9]:
physical_devices = tf.config.list_physical_devices('GPU')
print(len(physical_devices))
tf.config.experimental.set_memory_growth(physical_devices[0], True)

1


In [10]:
# see https://www.kaggle.com/dmitrypukhov/cnn-with-imagedatagenerator-flow-from-dataframe for train/test/val split 
# example

# may need to either create a test dataset from the original dataset or just download a new one

In [11]:
vgg16_model = tf.keras.applications.vgg16.VGG16(weights='imagenet')
#weights='imagenet'

In [12]:
model = Sequential()
for layer in vgg16_model.layers[:-1]:
    model.add(layer)

In [13]:
for layer in model.layers:
    layer.trainable = True

In [14]:
model.add(Dense(units=13, activation='softmax'))

In [15]:
#model.summary()

In [16]:
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy',
              metrics=['accuracy'])

In [17]:
model.fit(x=train_generator,
          steps_per_epoch=len(train_generator),
          validation_data=validation_generator,
          validation_steps=len(validation_generator),
          epochs=100,
          verbose=2)

Epoch 1/100
195/195 - 83s - loss: 2.5928 - accuracy: 0.1139 - val_loss: 2.3657 - val_accuracy: 0.1674 - 83s/epoch - 426ms/step
Epoch 2/100
195/195 - 77s - loss: 2.1473 - accuracy: 0.2582 - val_loss: 1.9276 - val_accuracy: 0.3281 - 77s/epoch - 394ms/step
Epoch 3/100
195/195 - 78s - loss: 1.7234 - accuracy: 0.3973 - val_loss: 1.6724 - val_accuracy: 0.4050 - 78s/epoch - 400ms/step
Epoch 4/100
195/195 - 78s - loss: 1.4692 - accuracy: 0.4843 - val_loss: 1.5583 - val_accuracy: 0.4662 - 78s/epoch - 402ms/step
Epoch 5/100
195/195 - 79s - loss: 1.2598 - accuracy: 0.5477 - val_loss: 1.5135 - val_accuracy: 0.4944 - 79s/epoch - 403ms/step
Epoch 6/100
195/195 - 79s - loss: 1.0220 - accuracy: 0.6376 - val_loss: 1.5566 - val_accuracy: 0.4962 - 79s/epoch - 404ms/step
Epoch 7/100
195/195 - 78s - loss: 0.8021 - accuracy: 0.7084 - val_loss: 1.7647 - val_accuracy: 0.4711 - 78s/epoch - 398ms/step
Epoch 8/100
195/195 - 78s - loss: 0.5998 - accuracy: 0.7804 - val_loss: 1.8439 - val_accuracy: 0.4869 - 78s/epo

KeyboardInterrupt: 