493 lines
17 KiB
Plaintext
493 lines
17 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "572dc7fb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from matplotlib import pyplot as plt\n",
|
|
"from matplotlib.image import imread\n",
|
|
"import pandas as pd\n",
|
|
"from collections import Counter\n",
|
|
"import json\n",
|
|
"import os\n",
|
|
"import re\n",
|
|
"import tempfile\n",
|
|
"import numpy as np\n",
|
|
"from os.path import exists\n",
|
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|
"from PIL import ImageFile\n",
|
|
"import sklearn as sk\n",
|
|
"from sklearn.model_selection import train_test_split, StratifiedShuffleSplit\n",
|
|
"import tensorflow as tf\n",
|
|
"import tensorflow.keras\n",
|
|
"from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
|
|
"from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation\n",
|
|
"from tensorflow.keras.models import Sequential\n",
|
|
"from tensorflow.keras.optimizers import Adam\n",
|
|
"# custom modules\n",
|
|
"import image_faults\n",
|
|
"\n",
|
|
"ImageFile.LOAD_TRUNCATED_IMAGES = True"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def add_regularization(model, regularizer=tf.keras.regularizers.l2(0.0001)):\n",
"    '''\n",
"    Return `model` rebuilt with `regularizer` set as the kernel regularizer of its layers.\n",
"\n",
"    The regularizer is assigned to every layer that has a `kernel_regularizer`\n",
"    attribute. The model is then serialised to JSON, rebuilt from that config and\n",
"    its weights are restored from a temporary HDF5 file.\n",
"\n",
"    model: Keras model to regularize.\n",
"    regularizer: instance of tf.keras.regularizers.Regularizer (default l2(0.0001)).\n",
"\n",
"    Returns the rebuilt model. When `regularizer` is not a Regularizer instance a\n",
"    message is printed and `model` is returned unchanged.\n",
"    '''\n",
"    if not isinstance(regularizer, tf.keras.regularizers.Regularizer):\n",
"        print(\"Regularizer must be a subclass of tf.keras.regularizers.Regularizer\")\n",
"        return model\n",
"\n",
"    for layer in model.layers:\n",
"        if hasattr(layer, 'kernel_regularizer'):\n",
"            layer.kernel_regularizer = regularizer\n",
"\n",
"    # When we change the layers attributes, the change only happens in the model config file\n",
"    model_json = model.to_json()\n",
"\n",
"    # Use a private temporary directory: a fixed file name in the shared temp dir\n",
"    # can be overwritten by another run, and the weights file was never cleaned up.\n",
"    with tempfile.TemporaryDirectory() as tmp_dir:\n",
"        tmp_weights_path = os.path.join(tmp_dir, 'tmp_weights.h5')\n",
"\n",
"        # Save the weights before reloading the model.\n",
"        model.save_weights(tmp_weights_path)\n",
"\n",
"        # load the model from the config\n",
"        model = tf.keras.models.model_from_json(model_json)\n",
"\n",
"        # Reload the model weights\n",
"        model.load_weights(tmp_weights_path, by_name=True)\n",
"    return model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "a5c72863",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# image_faults.faulty_images() # removes faulty images\n",
|
|
"df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "1057a442",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def dict_pics_jup():\n",
"    '''\n",
"    Build the {source url: local image path} dict used to replace source urls\n",
"    with image locations as input.\n",
"\n",
"    Urls are read from temp_pics_source_list.txt. A url is left out of the dict\n",
"    when either the name pattern or the .jpg/.jpeg/.png extension is not found in it.\n",
"    '''\n",
"    target_dir = os.sep.join([os.getcwd(), \"training_images\"])\n",
"    with open('temp_pics_source_list.txt') as f:\n",
"        source_urls = json.load(f)\n",
"\n",
"    name_re = re.compile(r'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))', re.IGNORECASE)\n",
"    ext_re = re.compile(r'(\\.jpg|\\.jpeg|\\.png)', re.IGNORECASE)\n",
"\n",
"    dict_pics = {}\n",
"    for url in source_urls:\n",
"        name_match = name_re.search(url)\n",
"        ext_match = ext_re.search(url)\n",
"        if name_match is None or ext_match is None:\n",
"            continue\n",
"        # file name = matched name + lower-cased extension\n",
"        tag = name_match.group() + ext_match.group().lower()\n",
"        dict_pics[url] = target_dir + os.sep + tag\n",
"    print(\"{source:target} dictionary created @ \" + target_dir)\n",
"    return dict_pics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "7a6146e6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{source:target} dictionary created @ /tf/training_images\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"dict_pics = dict_pics_jup()\n",
"\n",
"with open('women_cat_list.txt') as f:\n",
"    women_cats = json.load(f)\n",
"with open('men_cat_list.txt') as f:\n",
"    men_cats = json.load(f)\n",
"\n",
"with open('temp_pics_source_list.txt') as f:\n",
"    tempics = json.load(f)\n",
"# list of image urls that did not get named properly which will be removed from the dataframe\n",
"drop_row_vals = [pic for pic in tempics if pic not in dict_pics]\n",
"\n",
"df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints\n",
"ddf = df[~df.PictureURL.isin(drop_row_vals)] # remove improperly named image files\n",
"df = ddf[~ddf.PrimaryCategoryID.isin(men_cats)] # removes rows whose category id is in men_cats\n",
"\n",
"# replace each source url with its local image path, keeping PictureURL as the first column\n",
"blah = pd.Series(df.PictureURL)\n",
"df = df.drop(labels=['PictureURL'], axis=1)\n",
"\n",
"blah = blah.apply(lambda x: dict_pics[x])\n",
"df = pd.concat([blah, df], axis=1)\n",
"df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # keeps only categories with more than 25 rows\n",
"\n",
"# shuffle the rows; seeded so that a rerun produces the same order\n",
"df = df.sample(frac=1, random_state=42)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Counter({'11632': 6505, '45333': 6505, '53548': 6505, '53557': 6505, '55793': 6505, '62107': 6505, '95672': 6505})\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Randomly drop rows until the classes are balanced (see the Counter printed below).\n",
"# random_state is fixed so that a rerun selects the same rows.\n",
"undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)\n",
"train, y_under = undersample.fit_resample(df, df['PrimaryCategoryID'])\n",
"print(Counter(train['PrimaryCategoryID']))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "506aa5cf",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# hold out 20% of the balanced rows as `test`; the remaining 80% stays in `train`\n",
"train, test = train_test_split(train, random_state=42, test_size=0.2)\n",
"# stratify=train['PrimaryCategoryID']\n",
"# train['PrimaryCategoryID'].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "4d72eb90",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Found 29143 validated image filenames belonging to 7 classes.\n",
|
|
"Found 7285 validated image filenames belonging to 7 classes.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Inputs are prepared with vgg16.preprocess_input only. A `rescale` factor is not used:\n",
"# Keras applies rescale after preprocessing_function, so rescale=1/255 would shrink the\n",
"# already mean-centred pixels and no longer match what the ImageNet weights expect.\n",
"datagen = ImageDataGenerator(validation_split=.2,\n",
"                             #samplewise_std_normalization=True,\n",
"                             #horizontal_flip= True,\n",
"                             #vertical_flip= True,\n",
"                             #width_shift_range= 0.2,\n",
"                             #height_shift_range= 0.2,\n",
"                             #rotation_range= 90,\n",
"                             preprocessing_function=tf.keras.applications.vgg16.preprocess_input)\n",
"# target_size is 224x224, the input size of VGG16 (was 244x244)\n",
"train_generator=datagen.flow_from_dataframe(\n",
"    dataframe=train,\n",
"    directory='./training_images',\n",
"    x_col='PictureURL',\n",
"    y_col='PrimaryCategoryID',\n",
"    batch_size=32,\n",
"    seed=42,\n",
"    shuffle=True,\n",
"    target_size=(224,224),\n",
"    subset='training'\n",
"    )\n",
"# both generators read `train`; validation_split=.2 decides which rows go to each subset\n",
"validation_generator=datagen.flow_from_dataframe(\n",
"    dataframe=train,\n",
"    directory='./training_images',\n",
"    x_col='PictureURL',\n",
"    y_col='PrimaryCategoryID',\n",
"    batch_size=32,\n",
"    seed=42,\n",
"    shuffle=True,\n",
"    target_size=(224,224),\n",
"    subset='validation'\n",
"    )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "7b70f37f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"imgs, labels = next(train_generator)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "1ed54bf5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def plotImages(images_arr):\n",
"    '''\n",
"    Show the images of images_arr side by side in a single row of 10 axes.\n",
"\n",
"    Images beyond the tenth are not drawn; the axis frame is hidden on every\n",
"    axes that receives an image.\n",
"    '''\n",
"    fig, axes = plt.subplots(1, 10, figsize=(20,20))\n",
"    for ax, img in zip(axes.flatten(), images_arr):\n",
"        ax.imshow(img)\n",
"        ax.axis('off')\n",
"    plt.tight_layout()\n",
"    plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "85934565",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#plotImages(imgs)\n",
|
|
"#print(labels)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "6322bcad",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"physical_devices = tf.config.list_physical_devices('GPU')\n",
"print(len(physical_devices))\n",
"# Enable memory growth on every visible GPU. Looping instead of indexing [0]\n",
"# means a machine without a GPU no longer raises IndexError here.\n",
"try:\n",
"    for gpu in physical_devices:\n",
"        tf.config.experimental.set_memory_growth(gpu, True)\n",
"except RuntimeError as err:\n",
"    # TensorFlow raises this when the GPUs were already initialized;\n",
"    # memory growth has to be set before that happens\n",
"    print(err)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "b31af79e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"base_model = tf.keras.applications.vgg16.VGG16(weights='imagenet')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "fe06f2bf",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#model = Sequential()\n",
|
|
"#for layer in base_model.layers[:-1]:\n",
|
|
"# model.add(layer)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "7d3cc82c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# loop through layers, add Dropout after layers 'fc1' and 'fc2'\n",
|
|
"# Copy every layer of base_model except the last one into a new Sequential model,\n",
"# adding a Dropout layer right after 'fc1' and 'fc2'\n",
"updated_model = Sequential()\n",
"for vgg_layer in base_model.layers[:-1]:\n",
"    updated_model.add(vgg_layer)\n",
"    if vgg_layer.name in ('fc1', 'fc2'):\n",
"        updated_model.add(Dropout(.50))\n",
"\n",
"model = updated_model\n",
"\n",
"# every copied layer stays trainable\n",
"for copied_layer in model.layers:\n",
"    copied_layer.trainable = True\n",
"\n",
"# new output layer: 7 units with softmax\n",
"model.add(Dense(units=7, activation='softmax'))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "c774d787",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Model: \"sequential\"\n",
|
|
"_________________________________________________________________\n",
|
|
" Layer (type) Output Shape Param # \n",
|
|
"=================================================================\n",
|
|
" block1_conv1 (Conv2D) (None, 224, 224, 64) 1792 \n",
|
|
" \n",
|
|
" block1_conv2 (Conv2D) (None, 224, 224, 64) 36928 \n",
|
|
" \n",
|
|
" block1_pool (MaxPooling2D) (None, 112, 112, 64) 0 \n",
|
|
" \n",
|
|
" block2_conv1 (Conv2D) (None, 112, 112, 128) 73856 \n",
|
|
" \n",
|
|
" block2_conv2 (Conv2D) (None, 112, 112, 128) 147584 \n",
|
|
" \n",
|
|
" block2_pool (MaxPooling2D) (None, 56, 56, 128) 0 \n",
|
|
" \n",
|
|
" block3_conv1 (Conv2D) (None, 56, 56, 256) 295168 \n",
|
|
" \n",
|
|
" block3_conv2 (Conv2D) (None, 56, 56, 256) 590080 \n",
|
|
" \n",
|
|
" block3_conv3 (Conv2D) (None, 56, 56, 256) 590080 \n",
|
|
" \n",
|
|
" block3_pool (MaxPooling2D) (None, 28, 28, 256) 0 \n",
|
|
" \n",
|
|
" block4_conv1 (Conv2D) (None, 28, 28, 512) 1180160 \n",
|
|
" \n",
|
|
" block4_conv2 (Conv2D) (None, 28, 28, 512) 2359808 \n",
|
|
" \n",
|
|
" block4_conv3 (Conv2D) (None, 28, 28, 512) 2359808 \n",
|
|
" \n",
|
|
" block4_pool (MaxPooling2D) (None, 14, 14, 512) 0 \n",
|
|
" \n",
|
|
" block5_conv1 (Conv2D) (None, 14, 14, 512) 2359808 \n",
|
|
" \n",
|
|
" block5_conv2 (Conv2D) (None, 14, 14, 512) 2359808 \n",
|
|
" \n",
|
|
" block5_conv3 (Conv2D) (None, 14, 14, 512) 2359808 \n",
|
|
" \n",
|
|
" block5_pool (MaxPooling2D) (None, 7, 7, 512) 0 \n",
|
|
" \n",
|
|
" flatten (Flatten) (None, 25088) 0 \n",
|
|
" \n",
|
|
" fc1 (Dense) (None, 4096) 102764544 \n",
|
|
" \n",
|
|
" dropout (Dropout) (None, 4096) 0 \n",
|
|
" \n",
|
|
" fc2 (Dense) (None, 4096) 16781312 \n",
|
|
" \n",
|
|
" dropout_1 (Dropout) (None, 4096) 0 \n",
|
|
" \n",
|
|
" dense (Dense) (None, 7) 28679 \n",
|
|
" \n",
|
|
"=================================================================\n",
|
|
"Total params: 134,289,223\n",
|
|
"Trainable params: 134,289,223\n",
|
|
"Non-trainable params: 0\n",
|
|
"_________________________________________________________________\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"#model = add_regularization(model)\n",
|
|
"model.summary()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "fd5d1246",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"model.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy',\n",
|
|
" metrics=['accuracy'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9cd2ba27",
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 1/30\n",
|
|
"911/911 [==============================] - 329s 356ms/step - loss: 1.8477 - accuracy: 0.2577 - val_loss: 1.6306 - val_accuracy: 0.3669\n",
|
|
"Epoch 2/30\n",
|
|
"911/911 [==============================] - 322s 353ms/step - loss: 1.4882 - accuracy: 0.4353 - val_loss: 1.4317 - val_accuracy: 0.4784\n",
|
|
"Epoch 3/30\n",
|
|
"911/911 [==============================] - 323s 354ms/step - loss: 1.3046 - accuracy: 0.5158 - val_loss: 1.2747 - val_accuracy: 0.5235\n",
|
|
"Epoch 4/30\n",
|
|
"911/911 [==============================] - 319s 350ms/step - loss: 1.1691 - accuracy: 0.5681 - val_loss: 1.2090 - val_accuracy: 0.5529\n",
|
|
"Epoch 5/30\n",
|
|
"911/911 [==============================] - 317s 348ms/step - loss: 1.0389 - accuracy: 0.6185 - val_loss: 1.1774 - val_accuracy: 0.5682\n",
|
|
"Epoch 6/30\n",
|
|
"911/911 [==============================] - 317s 348ms/step - loss: 0.9125 - accuracy: 0.6656 - val_loss: 1.2237 - val_accuracy: 0.5639\n",
|
|
"Epoch 7/30\n",
|
|
"147/911 [===>..........................] - ETA: 3:39 - loss: 0.7312 - accuracy: 0.7256"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"model.fit(x=train_generator,\n",
|
|
" steps_per_epoch=len(train_generator),\n",
|
|
" validation_data=validation_generator,\n",
|
|
" validation_steps=len(validation_generator),\n",
|
|
" epochs=30,\n",
|
|
" verbose=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "63f791af",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|