472 lines
16 KiB
Plaintext
472 lines
16 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "572dc7fb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from matplotlib import pyplot as plt\n",
|
|
"from matplotlib.image import imread\n",
|
|
"import pandas as pd\n",
|
|
"from collections import Counter\n",
|
|
"import json\n",
|
|
"import os\n",
|
|
"import re\n",
|
|
"import tempfile\n",
|
|
"import numpy as np\n",
|
|
"from os.path import exists\n",
|
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|
"from PIL import ImageFile\n",
|
|
"import sklearn as sk\n",
|
|
"from sklearn.model_selection import train_test_split, StratifiedShuffleSplit\n",
|
|
"import tensorflow as tf\n",
|
|
"import tensorflow.keras\n",
|
|
"from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
|
|
"from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation\n",
|
|
"from tensorflow.keras.models import Sequential\n",
|
|
"from tensorflow.keras.optimizers import Adam\n",
|
|
"# custom modules\n",
|
|
"import image_faults\n",
|
|
"\n",
|
|
"ImageFile.LOAD_TRUNCATED_IMAGES = True"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def add_regularization(model, regularizer=tf.keras.regularizers.l2(0.0001)):\n",
|
|
"\n",
|
|
" if not isinstance(regularizer, tf.keras.regularizers.Regularizer):\n",
|
|
" print(\"Regularizer must be a subclass of tf.keras.regularizers.Regularizer\")\n",
|
|
" return model\n",
|
|
"\n",
|
|
" for layer in model.layers:\n",
|
|
" for attr in ['kernel_regularizer']:\n",
|
|
" if hasattr(layer, attr):\n",
|
|
" setattr(layer, attr, regularizer)\n",
|
|
"\n",
|
|
" # When we change the layers attributes, the change only happens in the model config file\n",
|
|
" model_json = model.to_json()\n",
|
|
"\n",
|
|
" # Save the weights before reloading the model.\n",
|
|
" tmp_weights_path = os.path.join(tempfile.gettempdir(), 'tmp_weights.h5')\n",
|
|
" model.save_weights(tmp_weights_path)\n",
|
|
"\n",
|
|
" # load the model from the config\n",
|
|
" model = tf.keras.models.model_from_json(model_json)\n",
|
|
" \n",
|
|
" # Reload the model weights\n",
|
|
" model.load_weights(tmp_weights_path, by_name=True)\n",
|
|
" return model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "a5c72863",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# image_faults.faulty_images() # removes faulty images\n",
|
|
"df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "1057a442",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{source:target} dictionary created @ /tf/training_images\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def dict_pics():\n",
|
|
" target_dir = os.getcwd() + os.sep + \"training_images\"\n",
|
|
" with open('temp_pics_source_list.txt') as f:\n",
|
|
" temp_pics_source_list = json.load(f)\n",
|
|
" dict_pics = {k:target_dir + os.sep + re.search(r'[^/]+(?=/\\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg' for k in temp_pics_source_list}\n",
|
|
" print(\"{source:target} dictionary created @ \" + target_dir)\n",
|
|
" return dict_pics\n",
|
|
"\n",
|
|
"dict_pics = dict_pics()\n",
|
|
"blah = pd.Series(df.PictureURL)\n",
|
|
"df = df.drop(labels=['PictureURL'], axis=1)\n",
|
|
"blah = blah.apply(lambda x: dict_pics[x])\n",
|
|
"df = pd.concat([blah, df],axis=1)\n",
|
|
"df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers\n",
|
|
"# removes non-existent image paths"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "7a6146e6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints\n",
|
|
"\n",
|
|
"df=df.sample(frac=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"undersample = RandomUnderSampler(sampling_strategy='auto')\n",
|
|
"train, y_under = undersample.fit_resample(df, df['PrimaryCategoryID'])\n",
|
|
"# print(Counter(train['PrimaryCategoryID']))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "506aa5cf",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train, test = train_test_split(train, test_size=0.1, random_state=42)\n",
|
|
"# stratify=train['PrimaryCategoryID']\n",
|
|
"# train['PrimaryCategoryID'].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "4d72eb90",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Found 5110 validated image filenames belonging to 13 classes.\n",
|
|
"Found 1277 validated image filenames belonging to 13 classes.\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/usr/local/lib/python3.8/dist-packages/keras_preprocessing/image/dataframe_iterator.py:279: UserWarning: Found 1 invalid image filename(s) in x_col=\"PictureURL\". These filename(s) will be ignored.\n",
|
|
" warnings.warn(\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"datagen = ImageDataGenerator(rescale=1./255., \n",
|
|
" validation_split=.2,\n",
|
|
" #samplewise_std_normalization=True,\n",
|
|
" #horizontal_flip= True,\n",
|
|
" #vertical_flip= True,\n",
|
|
" #width_shift_range= 0.2,\n",
|
|
" #height_shift_range= 0.2,\n",
|
|
" #rotation_range= 90,\n",
|
|
" preprocessing_function=tf.keras.applications.vgg16.preprocess_input)\n",
|
|
"train_generator=datagen.flow_from_dataframe(\n",
|
|
" dataframe=train[:len(train)],\n",
|
|
" directory='./training_images',\n",
|
|
" x_col='PictureURL',\n",
|
|
" y_col='PrimaryCategoryID',\n",
|
|
" batch_size=32,\n",
|
|
" seed=42,\n",
|
|
" shuffle=True,\n",
|
|
" target_size=(224,224),\n",
|
|
" subset='training'\n",
|
|
" )\n",
|
|
"validation_generator=datagen.flow_from_dataframe(\n",
|
|
" dataframe=train[:len(train)], # is using train right?\n",
|
|
" directory='./training_images',\n",
|
|
" x_col='PictureURL',\n",
|
|
" y_col='PrimaryCategoryID',\n",
|
|
" batch_size=32,\n",
|
|
" seed=42,\n",
|
|
" shuffle=True,\n",
|
|
" target_size=(224,224),\n",
|
|
" subset='validation'\n",
|
|
" )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "7b70f37f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"imgs, labels = next(train_generator)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "1ed54bf5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def plotImages(images_arr):\n",
|
|
" fig, axes = plt.subplots(1, 10, figsize=(20,20))\n",
|
|
" axes = axes.flatten()\n",
|
|
" for img, ax in zip( images_arr, axes):\n",
|
|
" ax.imshow(img)\n",
|
|
" ax.axis('off')\n",
|
|
" plt.tight_layout()\n",
|
|
" plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "85934565",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#plotImages(imgs)\n",
|
|
"#print(labels)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "6322bcad",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"physical_devices = tf.config.list_physical_devices('GPU')\n",
|
|
"print(len(physical_devices))\n",
|
|
"tf.config.experimental.set_memory_growth(physical_devices[0], True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "07fd25c6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# see https://www.kaggle.com/dmitrypukhov/cnn-with-imagedatagenerator-flow-from-dataframe for train/test/val split \n",
|
|
"# example\n",
|
|
"\n",
|
|
"# may need to either create a test dataset from the original dataset or just download a new one"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "b31af79e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"vgg19_model = tf.keras.applications.vgg16.VGG16(weights='imagenet')\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "fe06f2bf",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"model = Sequential()\n",
|
|
"for layer in vgg19_model.layers[:-1]:\n",
|
|
" model.add(layer)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "7d3cc82c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"for layer in model.layers:\n",
|
|
" layer.trainable = True"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "ea620129",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#model.add(Dropout(.5))\n",
|
|
"#model.add(Dense(64, activation='softmax'))\n",
|
|
"# model.add(Dropout(.25))\n",
|
|
"model.add(Dense(units=13, activation='softmax'))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "c774d787",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"model = add_regularization(model)\n",
|
|
"#model.summary()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "fd5d1246",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"model.compile(optimizer=Adam(learning_rate=1e-5), loss='categorical_crossentropy',\n",
|
|
" metrics=['accuracy'])\n",
|
|
"# sparse_categorical_crossentropy"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "9cd2ba27",
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 1/30\n",
|
|
"160/160 [==============================] - 59s 360ms/step - loss: 2.7627 - accuracy: 0.1125 - val_loss: 2.7406 - val_accuracy: 0.1237\n",
|
|
"Epoch 2/30\n",
|
|
"160/160 [==============================] - 56s 351ms/step - loss: 2.7151 - accuracy: 0.1399 - val_loss: 2.7219 - val_accuracy: 0.1402\n",
|
|
"Epoch 3/30\n",
|
|
"160/160 [==============================] - 56s 351ms/step - loss: 2.6875 - accuracy: 0.1566 - val_loss: 2.6897 - val_accuracy: 0.1629\n",
|
|
"Epoch 4/30\n",
|
|
"160/160 [==============================] - 56s 353ms/step - loss: 2.6820 - accuracy: 0.1726 - val_loss: 2.6867 - val_accuracy: 0.1684\n",
|
|
"Epoch 5/30\n",
|
|
"160/160 [==============================] - 57s 355ms/step - loss: 2.6579 - accuracy: 0.1771 - val_loss: 2.6919 - val_accuracy: 0.1558\n",
|
|
"Epoch 6/30\n",
|
|
"160/160 [==============================] - 56s 353ms/step - loss: 2.6361 - accuracy: 0.1994 - val_loss: 2.6813 - val_accuracy: 0.1832\n",
|
|
"Epoch 7/30\n",
|
|
"160/160 [==============================] - 56s 352ms/step - loss: 2.6196 - accuracy: 0.2084 - val_loss: 2.6592 - val_accuracy: 0.1950\n",
|
|
"Epoch 8/30\n",
|
|
"160/160 [==============================] - 57s 353ms/step - loss: 2.6031 - accuracy: 0.2172 - val_loss: 2.6693 - val_accuracy: 0.1770\n",
|
|
"Epoch 9/30\n",
|
|
"160/160 [==============================] - 57s 355ms/step - loss: 2.5878 - accuracy: 0.2274 - val_loss: 2.6543 - val_accuracy: 0.2091\n",
|
|
"Epoch 10/30\n",
|
|
"160/160 [==============================] - 56s 350ms/step - loss: 2.5687 - accuracy: 0.2450 - val_loss: 2.6551 - val_accuracy: 0.1942\n",
|
|
"Epoch 11/30\n",
|
|
"160/160 [==============================] - 57s 354ms/step - loss: 2.5543 - accuracy: 0.2568 - val_loss: 2.6591 - val_accuracy: 0.2020\n",
|
|
"Epoch 12/30\n",
|
|
"160/160 [==============================] - 56s 352ms/step - loss: 2.5403 - accuracy: 0.2685 - val_loss: 2.6513 - val_accuracy: 0.1973\n",
|
|
"Epoch 13/30\n",
|
|
"160/160 [==============================] - 56s 352ms/step - loss: 2.5311 - accuracy: 0.2695 - val_loss: 2.6445 - val_accuracy: 0.2060\n",
|
|
"Epoch 14/30\n",
|
|
"160/160 [==============================] - 56s 351ms/step - loss: 2.5217 - accuracy: 0.2775 - val_loss: 2.6476 - val_accuracy: 0.2044\n",
|
|
"Epoch 15/30\n",
|
|
"160/160 [==============================] - 56s 351ms/step - loss: 2.5147 - accuracy: 0.2830 - val_loss: 2.6419 - val_accuracy: 0.2036\n",
|
|
"Epoch 16/30\n",
|
|
"160/160 [==============================] - 56s 351ms/step - loss: 2.5084 - accuracy: 0.2851 - val_loss: 2.6396 - val_accuracy: 0.2200\n",
|
|
"Epoch 17/30\n",
|
|
"160/160 [==============================] - 56s 348ms/step - loss: 2.5025 - accuracy: 0.2879 - val_loss: 2.6463 - val_accuracy: 0.2302\n",
|
|
"Epoch 18/30\n",
|
|
"160/160 [==============================] - 56s 350ms/step - loss: 2.4971 - accuracy: 0.2918 - val_loss: 2.6346 - val_accuracy: 0.2208\n",
|
|
"Epoch 19/30\n",
|
|
"160/160 [==============================] - 56s 353ms/step - loss: 2.4924 - accuracy: 0.2967 - val_loss: 2.6366 - val_accuracy: 0.2208\n",
|
|
"Epoch 20/30\n",
|
|
"160/160 [==============================] - 57s 354ms/step - loss: 2.4882 - accuracy: 0.2988 - val_loss: 2.6317 - val_accuracy: 0.2271\n",
|
|
"Epoch 21/30\n",
|
|
"160/160 [==============================] - 56s 349ms/step - loss: 2.4854 - accuracy: 0.3004 - val_loss: 2.6431 - val_accuracy: 0.2240\n",
|
|
"Epoch 22/30\n",
|
|
"160/160 [==============================] - 56s 352ms/step - loss: 2.4784 - accuracy: 0.3068 - val_loss: 2.6345 - val_accuracy: 0.2114\n",
|
|
"Epoch 23/30\n",
|
|
"160/160 [==============================] - 57s 354ms/step - loss: 2.4722 - accuracy: 0.3106 - val_loss: 2.6276 - val_accuracy: 0.2294\n",
|
|
"Epoch 24/30\n",
|
|
"160/160 [==============================] - 57s 354ms/step - loss: 2.4687 - accuracy: 0.3100 - val_loss: 2.6383 - val_accuracy: 0.2177\n",
|
|
"Epoch 25/30\n",
|
|
"160/160 [==============================] - 57s 354ms/step - loss: 2.4649 - accuracy: 0.3108 - val_loss: 2.6322 - val_accuracy: 0.2122\n",
|
|
"Epoch 26/30\n",
|
|
"160/160 [==============================] - 57s 354ms/step - loss: 2.4644 - accuracy: 0.3141 - val_loss: 2.6243 - val_accuracy: 0.2247\n",
|
|
"Epoch 27/30\n",
|
|
"160/160 [==============================] - 56s 352ms/step - loss: 2.4599 - accuracy: 0.3188 - val_loss: 2.6332 - val_accuracy: 0.2138\n",
|
|
"Epoch 28/30\n",
|
|
"160/160 [==============================] - 57s 353ms/step - loss: 2.4550 - accuracy: 0.3229 - val_loss: 2.6287 - val_accuracy: 0.2232\n",
|
|
"Epoch 29/30\n",
|
|
"160/160 [==============================] - 57s 354ms/step - loss: 2.4502 - accuracy: 0.3217 - val_loss: 2.6216 - val_accuracy: 0.2287\n",
|
|
"Epoch 30/30\n",
|
|
"160/160 [==============================] - 56s 351ms/step - loss: 2.4506 - accuracy: 0.3190 - val_loss: 2.6329 - val_accuracy: 0.1793\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<keras.callbacks.History at 0x7f7803569ac0>"
|
|
]
|
|
},
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"model.fit(x=train_generator,\n",
|
|
" steps_per_epoch=len(train_generator),\n",
|
|
" validation_data=validation_generator,\n",
|
|
" validation_steps=len(validation_generator),\n",
|
|
" epochs=30,\n",
|
|
" verbose=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "63f791af",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|