In [1]:
from matplotlib import pyplot as plt
from matplotlib.image import imread
import pandas as pd
from collections import Counter
import json
import os
import re
import tempfile
import numpy as np
from os.path import exists
from imblearn.under_sampling import RandomUnderSampler
from PIL import ImageFile
import sklearn as sk
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
# custom modules
import image_faults

ImageFile.LOAD_TRUNCATED_IMAGES = True

In [2]:
def add_regularization(model, regularizer=tf.keras.regularizers.l2(0.0001)):

    if not isinstance(regularizer, tf.keras.regularizers.Regularizer):
      print("Regularizer must be a subclass of tf.keras.regularizers.Regularizer")
      return model

    for layer in model.layers:
        for attr in ['kernel_regularizer']:
            if hasattr(layer, attr):
              setattr(layer, attr, regularizer)

    # When we change the layers attributes, the change only happens in the model config file
    model_json = model.to_json()

    # Save the weights before reloading the model.
    tmp_weights_path = os.path.join(tempfile.gettempdir(), 'tmp_weights.h5')
    model.save_weights(tmp_weights_path)

    # load the model from the config
    model = tf.keras.models.model_from_json(model_json)
    
    # Reload the model weights
    model.load_weights(tmp_weights_path, by_name=True)
    return model

In [3]:
image_faults.faulty_images() # removes faulty images
df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)

In [4]:
def dict_pics_jup():
    '''
    {source:target} dict used to replace source urls with image location as input
    '''
    target_dir = os.getcwd() + os.sep + "training_images"
    with open('temp_pics_source_list.txt') as f:
        temp_pics_source_list = json.load(f)
        
    dict_pics = {}
    for k in temp_pics_source_list:
         patt_1 = re.search(r'[^/]+(?=/\$_|.(\.jpg|\.jpeg|\.png))', k, re.IGNORECASE)
         patt_2 = re.search(r'(\.jpg|\.jpeg|\.png)', k, re.IGNORECASE)
         if patt_1 and patt_2 is not None:
             tag = patt_1.group() + patt_2.group().lower()
             file_name = target_dir + os.sep + tag
             dict_pics.update({k:file_name})
    print("{source:target} dictionary created @ " + target_dir)
    return dict_pics


In [5]:
dict_pics = dict_pics_jup()
with open('temp_pics_source_list.txt') as f:
    tempics = json.load(f)
# list of image urls that did not get named properly which will be removed from the dataframe
drop_row_vals = []
for pic in tempics:
    try:
        dict_pics[pic]
    except KeyError:
        drop_row_vals.append(pic)
        
df = df[df.PictureURL.isin(drop_row_vals)==False]
# TODO drop men's or women's categories here
blah = pd.Series(df.PictureURL)
df = df.drop(labels=['PictureURL'], axis=1)

blah = blah.apply(lambda x: dict_pics[x])
df = pd.concat([blah, df],axis=1)
df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers

{source:target} dictionary created @ /tf/training_images


In [6]:
df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints

df=df.sample(frac=1)
len(drop_row_vals)

17

In [7]:
undersample = RandomUnderSampler(sampling_strategy='auto')
train, y_under = undersample.fit_resample(df, df['PrimaryCategoryID'])
#print(Counter(train['PrimaryCategoryID']))

In [8]:
train, test = train_test_split(train, test_size=0.1, random_state=42)
# stratify=train['PrimaryCategoryID']
# train['PrimaryCategoryID'].value_counts()

In [9]:
datagen = ImageDataGenerator(rescale=1./255., 
                             validation_split=.2,
                             #samplewise_std_normalization=True,
                             #horizontal_flip= True,
                             #vertical_flip= True,
                             #width_shift_range= 0.2,
                             #height_shift_range= 0.2,
                             #rotation_range= 90,
                             preprocessing_function=tf.keras.applications.xception.preprocess_input)
train_generator=datagen.flow_from_dataframe(
    dataframe=train[:len(train)],
    directory='./training_images',
    x_col='PictureURL',
    y_col='PrimaryCategoryID',
    batch_size=64,
    seed=42,
    shuffle=True,
    target_size=(299,299),
    subset='training'
    )
validation_generator=datagen.flow_from_dataframe(
    dataframe=train[:len(train)], # is using train right?
    directory='./training_images',
    x_col='PictureURL',
    y_col='PrimaryCategoryID',
    batch_size=64,
    seed=42,
    shuffle=True,
    target_size=(299,299),
    subset='validation'
    )



Found 53005 validated image filenames belonging to 13 classes.
Found 13251 validated image filenames belonging to 13 classes.


In [10]:
imgs, labels = next(train_generator)

In [11]:
def plotImages(images_arr):
    fig, axes = plt.subplots(1, 10, figsize=(20,20))
    axes = axes.flatten()
    for img, ax in zip( images_arr, axes):
        ax.imshow(img)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

In [12]:
#plotImages(imgs)
#print(labels)

In [13]:
physical_devices = tf.config.list_physical_devices('GPU')
print(len(physical_devices))
tf.config.experimental.set_memory_growth(physical_devices[0], True)

1


In [14]:
# see https://www.kaggle.com/dmitrypukhov/cnn-with-imagedatagenerator-flow-from-dataframe for train/test/val split 
# example

# may need to either create a test dataset from the original dataset or just download a new one

In [15]:
base_model = tf.keras.applications.xception.Xception(include_top=False, weights='imagenet', pooling='avg')
#base_model.summary()

In [16]:
for layer in base_model.layers:
    layer.trainable = True
    
output = Dense(13, activation='softmax')(base_model.output)
model = tf.keras.Model(base_model.input, output)
#model = add_regularization(model)
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 block1_conv1 (Conv2D)          (None, None, None,   864         ['input_1[0][0]']                
                                32)                                                               
                                                                                                  
 block1_conv1_bn (BatchNormaliz  (None, None, None,   128        ['block1_conv1[0][0]']           
 ation)                         32)                                                           

In [17]:
#model.add(Dropout(.5))
#model.add(Dense(64, activation='softmax'))
# model.add(Dropout(.25))
#model = add_regularization(model)


In [18]:
model.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy',
              metrics=['accuracy'])
# sparse_categorical_crossentropy

In [19]:
model.fit(x=train_generator,
          steps_per_epoch=len(train_generator),
          validation_data=validation_generator,
          validation_steps=len(validation_generator),
          epochs=30,
          verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
172/829 [=====>........................] - ETA: 7:57 - loss: 0.1030 - accuracy: 0.9787

KeyboardInterrupt: 