From 301f4eccde4ddf51b58d6514a9422155a701f864 Mon Sep 17 00:00:00 2001 From: JJ Date: Thu, 9 Feb 2023 05:54:51 +0900 Subject: [PATCH] overfit training examples with 3-layer net and 5-layer net & done inline question 1 #1 - the sensitivity of the network --- .../assignment2/FullyConnectedNets.ipynb | 748 ++++++++++++++++++ 1 file changed, 748 insertions(+) create mode 100644 cs231n_2022/assignment2/FullyConnectedNets.ipynb diff --git a/cs231n_2022/assignment2/FullyConnectedNets.ipynb b/cs231n_2022/assignment2/FullyConnectedNets.ipynb new file mode 100644 index 0000000..8ac753b --- /dev/null +++ b/cs231n_2022/assignment2/FullyConnectedNets.ipynb @@ -0,0 +1,748 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "d:\\Assignments\\ML_DL\\AI_git\\cs231n_2022\\assignment2\\cs231n\\datasets\n", + "d:\\Assignments\\ML_DL\\AI_git\\cs231n_2022\\assignment2\n" + ] + } + ], + "source": [ + "# This downloads the CIFAR-10 dataset to your Drive\n", + "# if it doesn't already exist.\n", + "%cd cs231n/datasets/\n", + "#!bash get_datasets.sh\n", + "!bash get_datasets_windows.sh\n", + "%cd ../../" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multi-Layer Fully Connected Network\n", + "In this exercise, you will implement a fully connected network with an arbitrary number of hidden layers." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read through the `FullyConnectedNet` class in the file `cs231n/classifiers/fc_net.py`.\n", + "\n", + "Implement the network initialization, forward pass, and backward pass. Throughout this assignment, you will be implementing layers in `cs231n/layers.py`. You can re-use your implementations for `affine_forward`, `affine_backward`, `relu_forward`, `relu_backward`, and `softmax_loss` from Assignment 1. 
For right now, don't worry about implementing dropout or batch/layer normalization yet, as you will add those features later.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "pdf-ignore" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=========== You can safely ignore the message below if you are NOT working on ConvolutionalNetworks.ipynb ===========\n", + "\tYou will need to compile a Cython extension for a portion of this assignment.\n", + "\tThe instructions to do this will be given in a section of the notebook below.\n" + ] + } + ], + "source": [ + "# Setup cell.\n", + "import time\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from cs231n.classifiers.fc_net import *\n", + "from cs231n.data_utils import get_CIFAR10_data\n", + "from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array\n", + "from cs231n.solver import Solver\n", + "\n", + "%matplotlib inline\n", + "plt.rcParams[\"figure.figsize\"] = (10.0, 8.0) # Set default size of plots.\n", + "plt.rcParams[\"image.interpolation\"] = \"nearest\"\n", + "plt.rcParams[\"image.cmap\"] = \"gray\"\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "def rel_error(x, y):\n", + " \"\"\"Returns relative error.\"\"\"\n", + " return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X_train: (49000, 3, 32, 32)\n", + "y_train: (49000,)\n", + "X_val: (1000, 3, 32, 32)\n", + "y_val: (1000,)\n", + "X_test: (1000, 3, 32, 32)\n", + "y_test: (1000,)\n" + ] + } + ], + "source": [ + "# Load the (preprocessed) CIFAR-10 data.\n", + "data = get_CIFAR10_data()\n", + "for k, v in list(data.items()):\n", + " print(f\"{k}: {v.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "## Initial Loss and Gradient Check\n", + "\n", + "As a sanity check, run the following to check the initial loss and to gradient check the network both with and without regularization. This is a good way to see if the initial losses seem reasonable.\n", + "\n", + "For gradient checking, you should expect to see errors around 1e-7 or less." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running check with reg = 0\n", + "Initial loss: 2.3004790897684924\n", + "W1 relative error: 7.696803870986541e-08\n", + "W2 relative error: 1.7087519140575808e-05\n", + "W3 relative error: 2.9508423118300657e-07\n", + "b1 relative error: 4.660094650186831e-09\n", + "b2 relative error: 2.085654124402131e-09\n", + "b3 relative error: 6.598642296022133e-11\n", + "Running check with reg = 3.14\n", + "Initial loss: 7.052114776533016\n", + "W1 relative error: 3.904542008453064e-09\n", + "W2 relative error: 6.86942277940646e-08\n", + "W3 relative error: 2.1311298702113723e-08\n", + "b1 relative error: 1.1683196894962977e-08\n", + "b2 relative error: 1.7223751746766738e-09\n", + "b3 relative error: 1.3200479211447775e-10\n" + ] + } + ], + "source": [ + "np.random.seed(231)\n", + "N, D, H1, H2, C = 2, 15, 20, 30, 10\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=(N,))\n", + "\n", + "for reg in [0, 3.14]:\n", + " print(\"Running check with reg = \", reg)\n", + " model = FullyConnectedNet(\n", + " [H1, H2],\n", + " input_dim=D,\n", + " num_classes=C,\n", + " reg=reg,\n", + " weight_scale=5e-2,\n", + " dtype=np.float64\n", + " )\n", + "\n", + " loss, grads = model.loss(X, y)\n", + " print(\"Initial loss: \", loss)\n", + "\n", + " # Most of the errors should be on the order of e-7 or smaller. 
\n", + " # NOTE: It is fine however to see an error for W2 on the order of e-5\n", + " # for the check when reg = 0.0\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)\n", + " print(f\"{name} relative error: {rel_error(grad_num, grads[name])}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As another sanity check, make sure your network can overfit on a small dataset of 50 images. First, we will try a three-layer network with 100 units in each hidden layer. In the following cell, tweak the **learning rate** and **weight initialization scale** to overfit and achieve 100% training accuracy within 20 epochs." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(Iteration 1 / 40) loss: 2.385904\n", + "(Epoch 0 / 20) train acc: 0.260000; val_acc: 0.112000\n", + "(Epoch 1 / 20) train acc: 0.380000; val_acc: 0.113000\n", + "(Epoch 2 / 20) train acc: 0.480000; val_acc: 0.109000\n", + "(Epoch 3 / 20) train acc: 0.680000; val_acc: 0.157000\n", + "(Epoch 4 / 20) train acc: 0.600000; val_acc: 0.123000\n", + "(Epoch 5 / 20) train acc: 0.700000; val_acc: 0.144000\n", + "(Iteration 11 / 40) loss: 0.986071\n", + "(Epoch 6 / 20) train acc: 0.740000; val_acc: 0.153000\n", + "(Epoch 7 / 20) train acc: 0.820000; val_acc: 0.168000\n", + "(Epoch 8 / 20) train acc: 0.900000; val_acc: 0.172000\n", + "(Epoch 9 / 20) train acc: 0.920000; val_acc: 0.187000\n", + "(Epoch 10 / 20) train acc: 0.900000; val_acc: 0.160000\n", + "(Iteration 21 / 40) loss: 0.258918\n", + "(Epoch 11 / 20) train acc: 0.980000; val_acc: 0.171000\n", + "(Epoch 12 / 20) train acc: 0.920000; val_acc: 0.182000\n", + "(Epoch 13 / 20) train acc: 0.980000; val_acc: 0.189000\n", + "(Epoch 14 / 20) train acc: 0.960000; val_acc: 0.194000\n", + "(Epoch 15 / 
20) train acc: 1.000000; val_acc: 0.177000\n", + "(Iteration 31 / 40) loss: 0.117595\n", + "(Epoch 16 / 20) train acc: 1.000000; val_acc: 0.174000\n", + "(Epoch 17 / 20) train acc: 1.000000; val_acc: 0.188000\n", + "(Epoch 18 / 20) train acc: 1.000000; val_acc: 0.189000\n", + "(Epoch 19 / 20) train acc: 1.000000; val_acc: 0.181000\n", + "(Epoch 20 / 20) train acc: 1.000000; val_acc: 0.182000\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# TODO: Use a three-layer Net to overfit 50 training examples by \n", + "# tweaking just the learning rate and initialization scale.\n", + "\n", + "num_train = 50\n", + "small_data = {\n", + " \"X_train\": data[\"X_train\"][:num_train],\n", + " \"y_train\": data[\"y_train\"][:num_train],\n", + " \"X_val\": data[\"X_val\"],\n", + " \"y_val\": data[\"y_val\"],\n", + "}\n", + "\n", + "weight_scale = 1e-2 # Experiment with this!\n", + "learning_rate = 1e-2 # Experiment with this!\n", + "model = FullyConnectedNet(\n", + " [100, 100],\n", + " weight_scale=weight_scale,\n", + " dtype=np.float64\n", + ")\n", + "solver = Solver(\n", + " model,\n", + " small_data,\n", + " print_every=10,\n", + " num_epochs=20,\n", + " batch_size=25,\n", + " update_rule=\"sgd\",\n", + " optim_config={\"learning_rate\": learning_rate},\n", + ")\n", + "solver.train()\n", + "\n", + "plt.plot(solver.loss_history)\n", + "plt.title(\"Training loss history\")\n", + "plt.xlabel(\"Iteration\")\n", + "plt.ylabel(\"Training loss\")\n", + "plt.grid(linestyle='--', linewidth=0.5)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, try to use a five-layer network with 100 units on each layer to overfit on 50 training examples. Again, you will have to adjust the learning rate and weight initialization scale, but you should be able to achieve 100% training accuracy within 20 epochs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(Iteration 1 / 40) loss: 4.783901\n", + "(Epoch 0 / 20) train acc: 0.220000; val_acc: 0.114000\n", + "(Epoch 1 / 20) train acc: 0.400000; val_acc: 0.114000\n", + "(Epoch 2 / 20) train acc: 0.460000; val_acc: 0.131000\n", + "(Epoch 3 / 20) train acc: 0.700000; val_acc: 0.115000\n", + "(Epoch 4 / 20) train acc: 0.740000; val_acc: 0.149000\n", + "(Epoch 5 / 20) train acc: 0.880000; val_acc: 0.125000\n", + "(Iteration 11 / 40) loss: 0.549426\n", + "(Epoch 6 / 20) train acc: 0.920000; val_acc: 0.126000\n", + "(Epoch 7 / 20) train acc: 0.940000; val_acc: 0.136000\n", + "(Epoch 8 / 20) train acc: 0.940000; val_acc: 0.146000\n", + "(Epoch 9 / 20) train acc: 0.960000; val_acc: 0.142000\n", + "(Epoch 10 / 20) train acc: 0.980000; val_acc: 0.134000\n", + "(Iteration 21 / 40) loss: 0.289741\n", + "(Epoch 11 / 20) train acc: 0.980000; val_acc: 0.144000\n", + "(Epoch 12 / 20) train acc: 1.000000; val_acc: 0.139000\n", + "(Epoch 13 / 20) train acc: 1.000000; val_acc: 0.135000\n", + "(Epoch 14 / 20) train acc: 1.000000; val_acc: 0.139000\n", + "(Epoch 15 / 20) train acc: 1.000000; val_acc: 0.140000\n", + "(Iteration 31 / 40) loss: 0.113694\n", + "(Epoch 16 / 20) train acc: 1.000000; val_acc: 0.140000\n", + "(Epoch 17 / 20) train acc: 1.000000; val_acc: 0.138000\n", + "(Epoch 18 / 20) train acc: 1.000000; val_acc: 0.136000\n", + "(Epoch 19 / 20) train acc: 1.000000; val_acc: 0.145000\n", + "(Epoch 20 / 20) train acc: 1.000000; val_acc: 0.142000\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# TODO: Use a five-layer Net to overfit 50 training examples by \n", + "# tweaking just the learning rate and initialization scale.\n", + "\n", + "num_train = 50\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "learning_rate = 5e-3 # Experiment with this!\n", + "weight_scale = 5e-2 # Experiment with this!\n", + "model = FullyConnectedNet(\n", + " [100, 100, 100, 100],\n", + " weight_scale=weight_scale,\n", + " dtype=np.float64\n", + ")\n", + "solver = Solver(\n", + " model,\n", + " small_data,\n", + " print_every=10,\n", + " num_epochs=20,\n", + " batch_size=25,\n", + " update_rule='sgd',\n", + " optim_config={'learning_rate': learning_rate},\n", + ")\n", + "solver.train()\n", + "\n", + "plt.plot(solver.loss_history)\n", + "plt.title('Training loss history')\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Training loss')\n", + "plt.grid(linestyle='--', linewidth=0.5)\n", + "plt.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "tags": [ + "pdf-inline" + ] + }, + "source": [ + "## Inline Question 1: \n", + "Did you notice anything about the comparative difficulty of training the three-layer network vs. training the five-layer network? In particular, based on your experience, which network seemed more sensitive to the initialization scale? Why do you think that is the case?\n", + "\n", + "## Answer:\n", + "[5 layer net이 3 layer net보다 weight initialization scale에 더 sensitive한 것으로 보인다. 네트워크가 깊어질수록, 작은 weight scale은 vanishing gradients/큰 weight scale은 exploding gradients 문제가 더욱 많이 발생할 것으로 예상된다.]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Update rules\n", + "So far we have used vanilla stochastic gradient descent (SGD) as our update rule. 
More sophisticated update rules can make it easier to train deep networks. We will implement a few of the most commonly used update rules and compare them to vanilla SGD." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SGD+Momentum\n", + "Stochastic gradient descent with momentum is a widely used update rule that tends to make deep networks converge faster than vanilla stochastic gradient descent. See the Momentum Update section at http://cs231n.github.io/neural-networks-3/#sgd for more information.\n", + "\n", + "Open the file `cs231n/optim.py` and read the documentation at the top of the file to make sure you understand the API. Implement the SGD+momentum update rule in the function `sgd_momentum` and run the following to check your implementation. You should see errors less than e-8." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from cs231n.optim import sgd_momentum\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "v = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "\n", + "config = {\"learning_rate\": 1e-3, \"velocity\": v}\n", + "next_w, _ = sgd_momentum(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [ 0.1406, 0.20738947, 0.27417895, 0.34096842, 0.40775789],\n", + " [ 0.47454737, 0.54133684, 0.60812632, 0.67491579, 0.74170526],\n", + " [ 0.80849474, 0.87528421, 0.94207368, 1.00886316, 1.07565263],\n", + " [ 1.14244211, 1.20923158, 1.27602105, 1.34281053, 1.4096 ]])\n", + "expected_velocity = np.asarray([\n", + " [ 0.5406, 0.55475789, 0.56891579, 0.58307368, 0.59723158],\n", + " [ 0.61138947, 0.62554737, 0.63970526, 0.65386316, 0.66802105],\n", + " [ 0.68217895, 0.69633684, 0.71049474, 0.72465263, 0.73881053],\n", + " [ 0.75296842, 0.76712632, 0.78128421, 0.79544211, 0.8096 ]])\n", + "\n", + "# Should see relative errors around e-8 
or less\n", + "print(\"next_w error: \", rel_error(next_w, expected_next_w))\n", + "print(\"velocity error: \", rel_error(expected_velocity, config[\"velocity\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have done so, run the following to train a six-layer network with both SGD and SGD+momentum. You should see the SGD+momentum update rule converge faster." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "num_train = 4000\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "solvers = {}\n", + "\n", + "for update_rule in ['sgd', 'sgd_momentum']:\n", + " print('Running with ', update_rule)\n", + " model = FullyConnectedNet(\n", + " [100, 100, 100, 100, 100],\n", + " weight_scale=5e-2\n", + " )\n", + "\n", + " solver = Solver(\n", + " model,\n", + " small_data,\n", + " num_epochs=5,\n", + " batch_size=100,\n", + " update_rule=update_rule,\n", + " optim_config={'learning_rate': 5e-3},\n", + " verbose=True,\n", + " )\n", + " solvers[update_rule] = solver\n", + " solver.train()\n", + "\n", + "fig, axes = plt.subplots(3, 1, figsize=(15, 15))\n", + "\n", + "axes[0].set_title('Training loss')\n", + "axes[0].set_xlabel('Iteration')\n", + "axes[1].set_title('Training accuracy')\n", + "axes[1].set_xlabel('Epoch')\n", + "axes[2].set_title('Validation accuracy')\n", + "axes[2].set_xlabel('Epoch')\n", + "\n", + "for update_rule, solver in solvers.items():\n", + " axes[0].plot(solver.loss_history, label=f\"loss_{update_rule}\")\n", + " axes[1].plot(solver.train_acc_history, label=f\"train_acc_{update_rule}\")\n", + " axes[2].plot(solver.val_acc_history, label=f\"val_acc_{update_rule}\")\n", + " \n", + "for ax in axes:\n", + " ax.legend(loc=\"best\", ncol=4)\n", + " ax.grid(linestyle='--', 
linewidth=0.5)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RMSProp and Adam\n", + "RMSProp [1] and Adam [2] are update rules that set per-parameter learning rates by using a running average of the second moments of gradients.\n", + "\n", + "In the file `cs231n/optim.py`, implement the RMSProp update rule in the `rmsprop` function and implement the Adam update rule in the `adam` function, and check your implementations using the tests below.\n", + "\n", + "**NOTE:** Please implement the _complete_ Adam update rule (with the bias correction mechanism), not the first simplified version mentioned in the course notes. \n", + "\n", + "[1] Tijmen Tieleman and Geoffrey Hinton. \"Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude.\" COURSERA: Neural Networks for Machine Learning 4 (2012).\n", + "\n", + "[2] Diederik Kingma and Jimmy Ba, \"Adam: A Method for Stochastic Optimization\", ICLR 2015." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test RMSProp implementation\n", + "from cs231n.optim import rmsprop\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "cache = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "\n", + "config = {'learning_rate': 1e-2, 'cache': cache}\n", + "next_w, _ = rmsprop(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [-0.39223849, -0.34037513, -0.28849239, -0.23659121, -0.18467247],\n", + " [-0.132737, -0.08078555, -0.02881884, 0.02316247, 0.07515774],\n", + " [ 0.12716641, 0.17918792, 0.23122175, 0.28326742, 0.33532447],\n", + " [ 0.38739248, 0.43947102, 0.49155973, 0.54365823, 0.59576619]])\n", + "expected_cache = np.asarray([\n", + " [ 0.5976, 0.6126277, 0.6277108, 0.64284931, 0.65804321],\n", + " [ 0.67329252, 0.68859723, 0.70395734, 0.71937285, 
0.73484377],\n", + " [ 0.75037008, 0.7659518, 0.78158892, 0.79728144, 0.81302936],\n", + " [ 0.82883269, 0.84469141, 0.86060554, 0.87657507, 0.8926 ]])\n", + "\n", + "# You should see relative errors around e-7 or less\n", + "print('next_w error: ', rel_error(expected_next_w, next_w))\n", + "print('cache error: ', rel_error(expected_cache, config['cache']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test Adam implementation\n", + "from cs231n.optim import adam\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "m = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "v = np.linspace(0.7, 0.5, num=N*D).reshape(N, D)\n", + "\n", + "config = {'learning_rate': 1e-2, 'm': m, 'v': v, 't': 5}\n", + "next_w, _ = adam(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [-0.40094747, -0.34836187, -0.29577703, -0.24319299, -0.19060977],\n", + " [-0.1380274, -0.08544591, -0.03286534, 0.01971428, 0.0722929],\n", + " [ 0.1248705, 0.17744702, 0.23002243, 0.28259667, 0.33516969],\n", + " [ 0.38774145, 0.44031188, 0.49288093, 0.54544852, 0.59801459]])\n", + "expected_v = np.asarray([\n", + " [ 0.69966, 0.68908382, 0.67851319, 0.66794809, 0.65738853,],\n", + " [ 0.64683452, 0.63628604, 0.6257431, 0.61520571, 0.60467385,],\n", + " [ 0.59414753, 0.58362676, 0.57311152, 0.56260183, 0.55209767,],\n", + " [ 0.54159906, 0.53110598, 0.52061845, 0.51013645, 0.49966, ]])\n", + "expected_m = np.asarray([\n", + " [ 0.48, 0.49947368, 0.51894737, 0.53842105, 0.55789474],\n", + " [ 0.57736842, 0.59684211, 0.61631579, 0.63578947, 0.65526316],\n", + " [ 0.67473684, 0.69421053, 0.71368421, 0.73315789, 0.75263158],\n", + " [ 0.77210526, 0.79157895, 0.81105263, 0.83052632, 0.85 ]])\n", + "\n", + "# You should see relative errors around e-7 or less\n", + "print('next_w error: ', rel_error(expected_next_w, 
next_w))\n", + "print('v error: ', rel_error(expected_v, config['v']))\n", + "print('m error: ', rel_error(expected_m, config['m']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have debugged your RMSProp and Adam implementations, run the following to train a pair of deep networks using these new update rules:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "learning_rates = {'rmsprop': 1e-4, 'adam': 1e-3}\n", + "for update_rule in ['adam', 'rmsprop']:\n", + " print('Running with ', update_rule)\n", + " model = FullyConnectedNet(\n", + " [100, 100, 100, 100, 100],\n", + " weight_scale=5e-2\n", + " )\n", + " solver = Solver(\n", + " model,\n", + " small_data,\n", + " num_epochs=5,\n", + " batch_size=100,\n", + " update_rule=update_rule,\n", + " optim_config={'learning_rate': learning_rates[update_rule]},\n", + " verbose=True\n", + " )\n", + " solvers[update_rule] = solver\n", + " solver.train()\n", + " print()\n", + " \n", + "fig, axes = plt.subplots(3, 1, figsize=(15, 15))\n", + "\n", + "axes[0].set_title('Training loss')\n", + "axes[0].set_xlabel('Iteration')\n", + "axes[1].set_title('Training accuracy')\n", + "axes[1].set_xlabel('Epoch')\n", + "axes[2].set_title('Validation accuracy')\n", + "axes[2].set_xlabel('Epoch')\n", + "\n", + "for update_rule, solver in solvers.items():\n", + " axes[0].plot(solver.loss_history, label=f\"{update_rule}\")\n", + " axes[1].plot(solver.train_acc_history, label=f\"{update_rule}\")\n", + " axes[2].plot(solver.val_acc_history, label=f\"{update_rule}\")\n", + " \n", + "for ax in axes:\n", + " ax.legend(loc='best', ncol=4)\n", + " ax.grid(linestyle='--', linewidth=0.5)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "pdf-inline" + ] + }, + "source": [ + "## Inline Question 2:\n", + "\n", + "AdaGrad, like Adam, is a per-parameter optimization method that 
uses the following update rule:\n", + "\n", + "```\n", + "cache += dw**2\n", + "w += - learning_rate * dw / (np.sqrt(cache) + eps)\n", + "```\n", + "\n", + "John notices that when he was training a network with AdaGrad, the updates became very small and his network was learning slowly. Using your knowledge of the AdaGrad update rule, why do you think the updates would become very small? Would Adam have the same issue?\n", + "\n", + "\n", + "## Answer: \n", + "[FILL THIS IN]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train a Good Model!\n", + "Train the best fully connected model that you can on CIFAR-10, storing your best model in the `best_model` variable. We require you to get at least 50% accuracy on the validation set using a fully connected network.\n", + "\n", + "If you are careful it should be possible to get accuracies above 55%, but we don't require it for this part and won't assign extra credit for doing so. Later in the assignment we will ask you to train the best convolutional network that you can on CIFAR-10, and we would prefer that you spend your effort working on convolutional networks rather than fully connected networks.\n", + "\n", + "**Note:** You might find it useful to complete the `BatchNormalization.ipynb` and `Dropout.ipynb` notebooks before completing this part, since those techniques can help you train powerful models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "best_model = None\n", + "\n", + "################################################################################\n", + "# TODO: Train the best FullyConnectedNet that you can on CIFAR-10. You might #\n", + "# find batch/layer normalization and dropout useful. Store your best model in #\n", + "# the best_model variable. 
#\n", + "################################################################################\n", + "# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "\n", + "pass\n", + "\n", + "# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "################################################################################\n", + "# END OF YOUR CODE #\n", + "################################################################################" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test Your Model!\n", + "Run your best model on the validation and test sets. You should achieve at least 50% accuracy on the validation set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_test_pred = np.argmax(best_model.loss(data['X_test']), axis=1)\n", + "y_val_pred = np.argmax(best_model.loss(data['X_val']), axis=1)\n", + "print('Validation set accuracy: ', (y_val_pred == data['y_val']).mean())\n", + "print('Test set accuracy: ', (y_test_pred == data['y_test']).mean())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cs231n", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "vscode": { + "interpreter": { + "hash": "5e00141dcb59c9550fed165717cca77960dd1735615929409445b81de918a119" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}