{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "572dc7fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-08-01 23:57:09.348119: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1\n"
     ]
    }
   ],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "from matplotlib.image import imread\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import tempfile\n",
    "import numpy as np\n",
    "from os.path import exists\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from PIL import ImageFile\n",
    "import sklearn as sk\n",
    "from sklearn.model_selection import train_test_split, StratifiedShuffleSplit\n",
    "import tensorflow as tf\n",
    "import tensorflow.keras\n",
    "from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
    "from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "# custom modules\n",
    "import image_faults\n",
    "\n",
    "ImageFile.LOAD_TRUNCATED_IMAGES = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6ea418cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_regularization(model, regularizer=tf.keras.regularizers.l2(0.0001)):\n",
    "\n",
    "    if not isinstance(regularizer, tf.keras.regularizers.Regularizer):\n",
    "        print(\"Regularizer must be a subclass of tf.keras.regularizers.Regularizer\")\n",
    "        return model\n",
    "\n",
    "    for layer in model.layers:\n",
    "        for attr in ['kernel_regularizer']:\n",
    "            if hasattr(layer, attr):\n",
    "                setattr(layer, attr, regularizer)\n",
    "\n",
    "    # When we change the layers attributes, the change only happens in the model config file\n",
    "    model_json = model.to_json()\n",
    "\n",
    "    # Save the weights before reloading the model.\n",
    "    tmp_weights_path = os.path.join(tempfile.gettempdir(), 'tmp_weights.h5')\n",
    "    model.save_weights(tmp_weights_path)\n",
    "\n",
    "    # load the model from the config\n",
    "    model = tf.keras.models.model_from_json(model_json)\n",
    "\n",
    "    # Reload the model weights\n",
    "    model.load_weights(tmp_weights_path, by_name=True)\n",
    "    return model"
   ]
  },
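  {
   "cell_type": "markdown",
   "id": "6ea418cd",
   "metadata": {},
   "source": [
    "A minimal sketch of how `add_regularization` is meant to be called, using a throwaway two-layer model as a stand-in (the `demo_model` below is hypothetical and not part of this pipeline): the helper rewrites every layer's `kernel_regularizer`, rebuilds the model from its JSON config, and reloads the saved weights, so the returned model has to be compiled again."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ea418ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative only: a tiny stand-in model to show the call pattern\n",
    "demo_model = Sequential([\n",
    "    Dense(16, activation='relu', input_shape=(8,)),\n",
    "    Dense(2, activation='softmax')\n",
    "])\n",
    "demo_model = add_regularization(demo_model, tf.keras.regularizers.l2(0.0001))\n",
    "# the reloaded layers now carry the L2 penalty; recompile before training\n",
    "demo_model.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy')"
   ]
  },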
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a5c72863",
   "metadata": {},
   "outputs": [],
   "source": [
    "image_faults.faulty_images() # removes faulty images\n",
    "df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1057a442",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def dict_pics_jup():\n",
    "    '''\n",
    "    Build a {source_url: local_file_path} dict used to swap image source urls\n",
    "    for the locations of the downloaded images.\n",
    "    '''\n",
    "    target_dir = os.getcwd() + os.sep + \"training_images\"\n",
    "    with open('temp_pics_source_list.txt') as f:\n",
    "        temp_pics_source_list = json.load(f)\n",
    "\n",
    "    dict_pics = {}\n",
    "    for k in temp_pics_source_list:\n",
    "        if not isinstance(k, str):\n",
    "            continue  # re.search raises TypeError on non-string entries (see traceback below)\n",
    "        patt_1 = re.search(r'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))', k, re.IGNORECASE)\n",
    "        patt_2 = re.search(r'(\\.jpg|\\.jpeg|\\.png)', k, re.IGNORECASE)\n",
    "        if patt_1 is not None and patt_2 is not None:\n",
    "            tag = patt_1.group() + patt_2.group().lower()\n",
    "            file_name = target_dir + os.sep + tag\n",
    "            dict_pics.update({k: file_name})\n",
    "    print(\"{source:target} dictionary created @ \" + target_dir)\n",
    "    return dict_pics"
   ]
  },
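  {
   "cell_type": "markdown",
   "id": "1057a443",
   "metadata": {},
   "source": [
    "A quick trace of what the two patterns in `dict_pics_jup` do to a single entry, using a made-up eBay-style URL (the exact URL layout is an assumption here): `patt_1` grabs the unique path segment in front of `/$_`, `patt_2` grabs the extension, and together they form the local file name under `training_images`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1057a444",
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative only: trace the two regexes on one hypothetical source url\n",
    "sample_url = 'https://i.ebayimg.com/00/z/AbCAAOSw/$_12.JPG'  # made-up url\n",
    "m1 = re.search(r'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))', sample_url, re.IGNORECASE)\n",
    "m2 = re.search(r'(\\.jpg|\\.jpeg|\\.png)', sample_url, re.IGNORECASE)\n",
    "if m1 is not None and m2 is not None:\n",
    "    print(m1.group() + m2.group().lower())  # -> 'AbCAAOSw.jpg'"
   ]
  },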
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7a6146e6",
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "expected string or bytes-like object",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-0009b269209e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_pics_jup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'women_cat_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mwomen_cats\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'men_cat_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-4-4701772f6383>\u001b[0m in \u001b[0;36mdict_pics_jup\u001b[0;34m()\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mpatt_1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     12\u001b[0m \u001b[0mpatt_2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'(\\.jpg|\\.jpeg|\\.png)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpatt_1\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mpatt_2\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/lib/python3.8/re.py\u001b[0m in \u001b[0;36msearch\u001b[0;34m(pattern, string, flags)\u001b[0m\n\u001b[1;32m    199\u001b[0m     \"\"\"Scan through string looking for a match to the pattern, returning\n\u001b[1;32m    200\u001b[0m     a Match object, or None if no match was found.\"\"\"\n\u001b[0;32m--> 201\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_compile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    202\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    203\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrepl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: expected string or bytes-like object"
     ]
    }
   ],
   "source": [
    "dict_pics = dict_pics_jup()\n",
    "\n",
    "with open('women_cat_list.txt') as f:\n",
    "    women_cats = json.load(f)\n",
    "with open('men_cat_list.txt') as f:\n",
    "    men_cats = json.load(f)\n",
    "\n",
    "with open('temp_pics_source_list.txt') as f:\n",
    "    tempics = json.load(f)\n",
    "# image urls that never received a properly named local file; their rows get dropped from the dataframe\n",
    "drop_row_vals = []\n",
    "for pic in tempics:\n",
    "    try:\n",
    "        dict_pics[pic]\n",
    "    except KeyError:\n",
    "        drop_row_vals.append(pic)\n",
    "\n",
    "df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints\n",
    "ddf = df[df.PictureURL.isin(drop_row_vals)==False] # remove improperly named image files\n",
    "df = ddf[ddf.PrimaryCategoryID.isin(men_cats)==False] # drops rows in mens categories, keeping womens\n",
    "\n",
    "pic_paths = pd.Series(df.PictureURL)\n",
    "df = df.drop(labels=['PictureURL'], axis=1)\n",
    "\n",
    "pic_paths = pic_paths.apply(lambda x: dict_pics[x])\n",
    "df = pd.concat([pic_paths, df], axis=1)\n",
    "df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers\n",
    "\n",
    "df = df.sample(frac=1) # shuffle the rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a3a86a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "undersample = RandomUnderSampler(sampling_strategy='auto')\n",
    "train, y_under = undersample.fit_resample(df, df['PrimaryCategoryID'])\n",
    "print(Counter(train['PrimaryCategoryID']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "506aa5cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = train_test_split(train, test_size=0.2, random_state=42)\n",
    "# stratify=train['PrimaryCategoryID']\n",
    "# train['PrimaryCategoryID'].value_counts()"
   ]
  },
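  {
   "cell_type": "markdown",
   "id": "506aa5d0",
   "metadata": {},
   "source": [
    "The commented-out `stratify` argument above hints at a variant worth sketching: if the 80/20 split should preserve the per-category balance the undersampler just produced, the label column can be passed to `train_test_split` directly. This is only a sketch of that alternative (the `strat_*` names are placeholders), not what the cell above ran."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "506aa5d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: stratified 80/20 split keyed on the category labels\n",
    "strat_train, strat_test = train_test_split(\n",
    "    train, test_size=0.2, random_state=42, stratify=train['PrimaryCategoryID'])\n",
    "print(strat_train['PrimaryCategoryID'].value_counts())"
   ]
  },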
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d72eb90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# note: vgg16.preprocess_input expects raw 0-255 pixel values, so applying\n",
    "# rescale=1./255. first changes what its mean subtraction sees\n",
    "datagen = ImageDataGenerator(rescale=1./255.,\n",
    "                             validation_split=.2,\n",
    "                             #samplewise_std_normalization=True,\n",
    "                             #horizontal_flip= True,\n",
    "                             #vertical_flip= True,\n",
    "                             #width_shift_range= 0.2,\n",
    "                             #height_shift_range= 0.2,\n",
    "                             #rotation_range= 90,\n",
    "                             preprocessing_function=tf.keras.applications.vgg16.preprocess_input)\n",
    "train_generator = datagen.flow_from_dataframe(\n",
    "    dataframe=train,\n",
    "    directory='./training_images',\n",
    "    x_col='PictureURL',\n",
    "    y_col='PrimaryCategoryID',\n",
    "    batch_size=32,\n",
    "    seed=42,\n",
    "    shuffle=True,\n",
    "    target_size=(224,224), # VGG16's fully connected head expects 224x224 inputs\n",
    "    subset='training'\n",
    ")\n",
    "validation_generator = datagen.flow_from_dataframe(\n",
    "    dataframe=train, # same dataframe; validation_split above carves out the 20% validation subset\n",
    "    directory='./training_images',\n",
    "    x_col='PictureURL',\n",
    "    y_col='PrimaryCategoryID',\n",
    "    batch_size=32,\n",
    "    seed=42,\n",
    "    shuffle=True,\n",
    "    target_size=(224,224),\n",
    "    subset='validation'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b70f37f",
   "metadata": {},
   "outputs": [],
   "source": [
    "imgs, labels = next(train_generator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ed54bf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plotImages(images_arr):\n",
    "    fig, axes = plt.subplots(1, 10, figsize=(20,20))\n",
    "    axes = axes.flatten()\n",
    "    for img, ax in zip(images_arr, axes):\n",
    "        ax.imshow(img)\n",
    "        ax.axis('off')\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85934565",
   "metadata": {},
   "outputs": [],
   "source": [
    "#plotImages(imgs)\n",
    "#print(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6322bcad",
   "metadata": {},
   "outputs": [],
   "source": [
    "physical_devices = tf.config.list_physical_devices('GPU')\n",
    "print(len(physical_devices), 'GPU(s) detected')\n",
    "# allocate GPU memory as needed instead of reserving it all up front (assumes a GPU is present)\n",
    "tf.config.experimental.set_memory_growth(physical_devices[0], True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b31af79e",
   "metadata": {},
   "outputs": [],
   "source": [
    "base_model = tf.keras.applications.vgg16.VGG16(weights='imagenet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe06f2bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "#model = Sequential()\n",
    "#for layer in base_model.layers[:-1]:\n",
    "#    model.add(layer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d3cc82c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# loop through layers, add Dropout after layers 'fc1' and 'fc2'\n",
    "updated_model = Sequential()\n",
    "for layer in base_model.layers[:-1]:\n",
    "    updated_model.add(layer)\n",
    "    if layer.name in ['fc1', 'fc2']:\n",
    "        updated_model.add(Dropout(.50))\n",
    "\n",
    "model = updated_model\n",
    "\n",
    "for layer in model.layers:\n",
    "    layer.trainable = True\n",
    "\n",
    "model.add(Dense(units=7, activation='softmax'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c774d787",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#model = add_regularization(model)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd5d1246",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy',\n",
    "              metrics=['accuracy'])"
   ]
  },
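  {
   "cell_type": "markdown",
   "id": "fd5d1247",
   "metadata": {},
   "source": [
    "One optional way to keep the 30-epoch run below in check is to hand callbacks to `fit`. This is only a sketch: the checkpoint file name and the patience value are placeholders, not settings from the original run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd5d1248",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: callbacks that could be passed as model.fit(..., callbacks=callbacks)\n",
    "callbacks = [\n",
    "    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),\n",
    "    tf.keras.callbacks.ModelCheckpoint('vgg16_classifier.h5', monitor='val_loss', save_best_only=True)\n",
    "]"
   ]
  },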
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cd2ba27",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "model.fit(x=train_generator,\n",
    "          steps_per_epoch=len(train_generator),\n",
    "          validation_data=validation_generator,\n",
    "          validation_steps=len(validation_generator),\n",
    "          epochs=30,\n",
    "          verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63f791af",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}