{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "572dc7fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-08-01 23:57:09.348119: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1\n"
     ]
    }
   ],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "from matplotlib.image import imread\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import tempfile\n",
    "import numpy as np\n",
    "from os.path import exists\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from PIL import ImageFile\n",
    "import sklearn as sk\n",
    "from sklearn.model_selection import train_test_split, StratifiedShuffleSplit\n",
    "import tensorflow as tf\n",
    "import tensorflow.keras\n",
    "from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
    "from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "# custom modules\n",
    "import image_faults\n",
    "\n",
    "ImageFile.LOAD_TRUNCATED_IMAGES = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6ea418cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_regularization(model, regularizer=tf.keras.regularizers.l2(0.0001)):\n",
    "\n",
    "    if not isinstance(regularizer, tf.keras.regularizers.Regularizer):\n",
    "        print(\"Regularizer must be a subclass of tf.keras.regularizers.Regularizer\")\n",
    "        return model\n",
    "\n",
    "    for layer in model.layers:\n",
    "        for attr in ['kernel_regularizer']:\n",
    "            if hasattr(layer, attr):\n",
    "                setattr(layer, attr, regularizer)\n",
    "\n",
    "    # When we change the layers attributes, the change only happens in the model config file\n",
    "    model_json = model.to_json()\n",
    "\n",
    "    # Save the weights before reloading the model.\n",
    "    tmp_weights_path = os.path.join(tempfile.gettempdir(), 'tmp_weights.h5')\n",
    "    model.save_weights(tmp_weights_path)\n",
    "\n",
    "    # load the model from the config\n",
    "    model = tf.keras.models.model_from_json(model_json)\n",
    "\n",
    "    # Reload the model weights\n",
    "    model.load_weights(tmp_weights_path, by_name=True)\n",
    "    return model"
   ]
  },
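  {
   "cell_type": "markdown",
   "id": "6ea418cd",
   "metadata": {},
   "source": [
    "A minimal sketch of how `add_regularization` is meant to be called, using a throwaway two-layer model as a stand-in (the `demo_model` below is hypothetical and not part of this pipeline): the helper rewrites every layer's `kernel_regularizer`, rebuilds the model from its JSON config, and reloads the saved weights, so the returned model has to be compiled again."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ea418ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative only: a tiny stand-in model to show the call pattern\n",
    "demo_model = Sequential([\n",
    "    Dense(16, activation='relu', input_shape=(8,)),\n",
    "    Dense(2, activation='softmax')\n",
    "])\n",
    "demo_model = add_regularization(demo_model, tf.keras.regularizers.l2(0.0001))\n",
    "# the reloaded layers now carry the L2 penalty; recompile before training\n",
    "demo_model.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy')"
   ]
  },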
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a5c72863",
   "metadata": {},
   "outputs": [],
   "source": [
    "image_faults.faulty_images() # removes faulty images\n",
    "df = pd.read_csv('expanded_class.csv', index_col=[0], low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1057a442",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def dict_pics_jup():\n",
    "    '''\n",
    "    Build a {source_url: local_file_path} dict used to swap image source urls\n",
    "    for the locations of the downloaded images.\n",
    "    '''\n",
    "    target_dir = os.getcwd() + os.sep + \"training_images\"\n",
    "    with open('temp_pics_source_list.txt') as f:\n",
    "        temp_pics_source_list = json.load(f)\n",
    "\n",
    "    dict_pics = {}\n",
    "    for k in temp_pics_source_list:\n",
    "        if not isinstance(k, str):\n",
    "            continue  # re.search raises TypeError on non-string entries (see traceback below)\n",
    "        patt_1 = re.search(r'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))', k, re.IGNORECASE)\n",
    "        patt_2 = re.search(r'(\\.jpg|\\.jpeg|\\.png)', k, re.IGNORECASE)\n",
    "        if patt_1 is not None and patt_2 is not None:\n",
    "            tag = patt_1.group() + patt_2.group().lower()\n",
    "            file_name = target_dir + os.sep + tag\n",
    "            dict_pics.update({k: file_name})\n",
    "    print(\"{source:target} dictionary created @ \" + target_dir)\n",
    "    return dict_pics"
   ]
  },
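  {
   "cell_type": "markdown",
   "id": "1057a443",
   "metadata": {},
   "source": [
    "A quick trace of what the two patterns in `dict_pics_jup` do to a single entry, using a made-up eBay-style URL (the exact URL layout is an assumption here): `patt_1` grabs the unique path segment in front of `/$_`, `patt_2` grabs the extension, and together they form the local file name under `training_images`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1057a444",
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative only: trace the two regexes on one hypothetical source url\n",
    "sample_url = 'https://i.ebayimg.com/00/z/AbCAAOSw/$_12.JPG'  # made-up url\n",
    "m1 = re.search(r'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))', sample_url, re.IGNORECASE)\n",
    "m2 = re.search(r'(\\.jpg|\\.jpeg|\\.png)', sample_url, re.IGNORECASE)\n",
    "if m1 is not None and m2 is not None:\n",
    "    print(m1.group() + m2.group().lower())  # -> 'AbCAAOSw.jpg'"
   ]
  },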
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7a6146e6",
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "expected string or bytes-like object",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-0009b269209e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_pics_jup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'women_cat_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mwomen_cats\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'men_cat_list.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-4-4701772f6383>\u001b[0m in \u001b[0;36mdict_pics_jup\u001b[0;34m()\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0mdict_pics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_pics_source_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mpatt_1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'[^/]+(?=/\\$_|.(\\.jpg|\\.jpeg|\\.png))'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     12\u001b[0m \u001b[0mpatt_2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'(\\.jpg|\\.jpeg|\\.png)'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIGNORECASE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpatt_1\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mpatt_2\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/lib/python3.8/re.py\u001b[0m in \u001b[0;36msearch\u001b[0;34m(pattern, string, flags)\u001b[0m\n\u001b[1;32m    199\u001b[0m     \"\"\"Scan through string looking for a match to the pattern, returning\n\u001b[1;32m    200\u001b[0m     a Match object, or None if no match was found.\"\"\"\n\u001b[0;32m--> 201\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_compile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    202\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    203\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrepl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: expected string or bytes-like object"
     ]
    }
   ],
   "source": [
    "dict_pics = dict_pics_jup()\n",
    "\n",
    "with open('women_cat_list.txt') as f:\n",
    "    women_cats = json.load(f)\n",
    "with open('men_cat_list.txt') as f:\n",
    "    men_cats = json.load(f)\n",
    "\n",
    "with open('temp_pics_source_list.txt') as f:\n",
    "    tempics = json.load(f)\n",
    "# image urls that never received a properly named local file; their rows get dropped from the dataframe\n",
    "drop_row_vals = []\n",
    "for pic in tempics:\n",
    "    try:\n",
    "        dict_pics[pic]\n",
    "    except KeyError:\n",
    "        drop_row_vals.append(pic)\n",
    "\n",
    "df['PrimaryCategoryID'] = df['PrimaryCategoryID'].astype(str) # pandas thinks ids are ints\n",
    "ddf = df[df.PictureURL.isin(drop_row_vals)==False] # remove improperly named image files\n",
    "df = ddf[ddf.PrimaryCategoryID.isin(men_cats)==False] # drops rows in mens categories, keeping womens\n",
    "\n",
    "pic_paths = pd.Series(df.PictureURL)\n",
    "df = df.drop(labels=['PictureURL'], axis=1)\n",
    "\n",
    "pic_paths = pic_paths.apply(lambda x: dict_pics[x])\n",
    "df = pd.concat([pic_paths, df], axis=1)\n",
    "df = df.groupby('PrimaryCategoryID').filter(lambda x: len(x)>25) # removes cat outliers\n",
    "\n",
    "df = df.sample(frac=1) # shuffle the rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a3a86a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "undersample = RandomUnderSampler(sampling_strategy='auto')\n",
    "train, y_under = undersample.fit_resample(df, df['PrimaryCategoryID'])\n",
    "print(Counter(train['PrimaryCategoryID']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "506aa5cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = train_test_split(train, test_size=0.2, random_state=42)\n",
    "# stratify=train['PrimaryCategoryID']\n",
    "# train['PrimaryCategoryID'].value_counts()"
   ]
  },
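  {
   "cell_type": "markdown",
   "id": "506aa5d0",
   "metadata": {},
   "source": [
    "The commented-out `stratify` argument above hints at a variant worth sketching: if the 80/20 split should preserve the per-category balance the undersampler just produced, the label column can be passed to `train_test_split` directly. This is only a sketch of that alternative (the `strat_*` names are placeholders), not what the cell above ran."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "506aa5d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: stratified 80/20 split keyed on the category labels\n",
    "strat_train, strat_test = train_test_split(\n",
    "    train, test_size=0.2, random_state=42, stratify=train['PrimaryCategoryID'])\n",
    "print(strat_train['PrimaryCategoryID'].value_counts())"
   ]
  },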
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d72eb90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# note: vgg16.preprocess_input expects raw 0-255 pixel values, so applying\n",
    "# rescale=1./255. first changes what its mean subtraction sees\n",
    "datagen = ImageDataGenerator(rescale=1./255.,\n",
    "                             validation_split=.2,\n",
    "                             #samplewise_std_normalization=True,\n",
    "                             #horizontal_flip= True,\n",
    "                             #vertical_flip= True,\n",
    "                             #width_shift_range= 0.2,\n",
    "                             #height_shift_range= 0.2,\n",
    "                             #rotation_range= 90,\n",
    "                             preprocessing_function=tf.keras.applications.vgg16.preprocess_input)\n",
    "train_generator = datagen.flow_from_dataframe(\n",
    "    dataframe=train,\n",
    "    directory='./training_images',\n",
    "    x_col='PictureURL',\n",
    "    y_col='PrimaryCategoryID',\n",
    "    batch_size=32,\n",
    "    seed=42,\n",
    "    shuffle=True,\n",
    "    target_size=(224,224), # VGG16's fully connected head expects 224x224 inputs\n",
    "    subset='training'\n",
    ")\n",
    "validation_generator = datagen.flow_from_dataframe(\n",
    "    dataframe=train, # same dataframe; validation_split above carves out the 20% validation subset\n",
    "    directory='./training_images',\n",
    "    x_col='PictureURL',\n",
    "    y_col='PrimaryCategoryID',\n",
    "    batch_size=32,\n",
    "    seed=42,\n",
    "    shuffle=True,\n",
    "    target_size=(224,224),\n",
    "    subset='validation'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b70f37f",
   "metadata": {},
   "outputs": [],
   "source": [
    "imgs, labels = next(train_generator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ed54bf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plotImages(images_arr):\n",
    "    fig, axes = plt.subplots(1, 10, figsize=(20,20))\n",
    "    axes = axes.flatten()\n",
    "    for img, ax in zip(images_arr, axes):\n",
    "        ax.imshow(img)\n",
    "        ax.axis('off')\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85934565",
   "metadata": {},
   "outputs": [],
   "source": [
    "#plotImages(imgs)\n",
    "#print(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6322bcad",
   "metadata": {},
   "outputs": [],
   "source": [
    "physical_devices = tf.config.list_physical_devices('GPU')\n",
    "print(len(physical_devices), 'GPU(s) detected')\n",
    "# allocate GPU memory as needed instead of reserving it all up front (assumes a GPU is present)\n",
    "tf.config.experimental.set_memory_growth(physical_devices[0], True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b31af79e",
   "metadata": {},
   "outputs": [],
   "source": [
    "base_model = tf.keras.applications.vgg16.VGG16(weights='imagenet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe06f2bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "#model = Sequential()\n",
    "#for layer in base_model.layers[:-1]:\n",
    "#    model.add(layer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d3cc82c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# loop through layers, add Dropout after layers 'fc1' and 'fc2'\n",
    "updated_model = Sequential()\n",
    "for layer in base_model.layers[:-1]:\n",
    "    updated_model.add(layer)\n",
    "    if layer.name in ['fc1', 'fc2']:\n",
    "        updated_model.add(Dropout(.50))\n",
    "\n",
    "model = updated_model\n",
    "\n",
    "for layer in model.layers:\n",
    "    layer.trainable = True\n",
    "\n",
    "model.add(Dense(units=7, activation='softmax'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c774d787",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#model = add_regularization(model)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd5d1246",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy',\n",
    "              metrics=['accuracy'])"
   ]
  },
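  {
   "cell_type": "markdown",
   "id": "fd5d1247",
   "metadata": {},
   "source": [
    "One optional way to keep the 30-epoch run below in check is to hand callbacks to `fit`. This is only a sketch: the checkpoint file name and the patience value are placeholders, not settings from the original run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd5d1248",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: callbacks that could be passed as model.fit(..., callbacks=callbacks)\n",
    "callbacks = [\n",
    "    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),\n",
    "    tf.keras.callbacks.ModelCheckpoint('vgg16_classifier.h5', monitor='val_loss', save_best_only=True)\n",
    "]"
   ]
  },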
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cd2ba27",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "model.fit(x=train_generator,\n",
    "          steps_per_epoch=len(train_generator),\n",
    "          validation_data=validation_generator,\n",
    "          validation_steps=len(validation_generator),\n",
    "          epochs=30,\n",
    "          verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63f791af",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}