From cbd9798447ec5832b93ca9acdd462bf397c9f2d0 Mon Sep 17 00:00:00 2001
From: JJ
Date: Thu, 9 Feb 2023 05:56:31 +0900
Subject: [PATCH] [feat] implement 'sgd_momentum' function #1

---
 cs231n_2022/assignment2/cs231n/optim.py | 163 ++++++++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 cs231n_2022/assignment2/cs231n/optim.py

diff --git a/cs231n_2022/assignment2/cs231n/optim.py b/cs231n_2022/assignment2/cs231n/optim.py
new file mode 100644
index 0000000..c7be715
--- /dev/null
+++ b/cs231n_2022/assignment2/cs231n/optim.py
@@ -0,0 +1,163 @@
+import numpy as np
+
+"""
+This file implements various first-order update rules that are commonly used
+for training neural networks. Each update rule accepts current weights and the
+gradient of the loss with respect to those weights and produces the next set of
+weights. Each update rule has the same interface:
+
+def update(w, dw, config=None):
+
+Inputs:
+  - w: A numpy array giving the current weights.
+  - dw: A numpy array of the same shape as w giving the gradient of the
+    loss with respect to w.
+  - config: A dictionary containing hyperparameter values such as learning
+    rate, momentum, etc. If the update rule requires caching values over many
+    iterations, then config will also hold these cached values.
+
+Returns:
+  - next_w: The next point after the update.
+  - config: The config dictionary to be passed to the next iteration of the
+    update rule.
+
+NOTE: For most update rules, the default learning rate will probably not
+perform well; however, the default values of the other hyperparameters should
+work well for a variety of different problems.
+
+For efficiency, update rules may perform in-place updates, mutating w and
+setting next_w equal to w.
+"""
+
+
+def sgd(w, dw, config=None):
+    """
+    Performs vanilla stochastic gradient descent.
+
+    config format:
+    - learning_rate: Scalar learning rate.
+    """
+    if config is None:
+        config = {}
+    config.setdefault("learning_rate", 1e-2)
+
+    w -= config["learning_rate"] * dw
+    return w, config
+
+
+def sgd_momentum(w, dw, config=None):
+    """
+    Performs stochastic gradient descent with momentum.
+
+    config format:
+    - learning_rate: Scalar learning rate.
+    - momentum: Scalar between 0 and 1 giving the momentum value.
+      Setting momentum = 0 reduces to sgd.
+    - velocity: A numpy array of the same shape as w and dw used to store a
+      moving average of the gradients.
+    """
+    if config is None:
+        config = {}
+    config.setdefault("learning_rate", 1e-2)
+    config.setdefault("momentum", 0.9)
+    v = config.get("velocity", np.zeros_like(w))
+
+    next_w = None
+    ###########################################################################
+    # TODO: Implement the momentum update formula. Store the updated value in #
+    # the next_w variable. You should also use and update the velocity v.     #
+    ###########################################################################
+    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
+
+    v = config["momentum"] * v - config["learning_rate"] * dw
+    next_w = w + v
+
+    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
+    ###########################################################################
+    #                             END OF YOUR CODE                            #
+    ###########################################################################
+    config["velocity"] = v
+
+    return next_w, config
+
+
+def rmsprop(w, dw, config=None):
+    """
+    Uses the RMSProp update rule, which uses a moving average of squared
+    gradient values to set adaptive per-parameter learning rates.
+
+    config format:
+    - learning_rate: Scalar learning rate.
+    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
+      gradient cache.
+    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
+    - cache: Moving average of second moments of gradients.
+    """
+    if config is None:
+        config = {}
+    config.setdefault("learning_rate", 1e-2)
+    config.setdefault("decay_rate", 0.99)
+    config.setdefault("epsilon", 1e-8)
+    config.setdefault("cache", np.zeros_like(w))
+
+    next_w = None
+    ###########################################################################
+    # TODO: Implement the RMSprop update formula, storing the next value of w #
+    # in the next_w variable. Don't forget to update the cache value stored   #
+    # in config['cache'].                                                     #
+    ###########################################################################
+    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
+
+    pass
+
+    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
+    ###########################################################################
+    #                             END OF YOUR CODE                            #
+    ###########################################################################
+
+    return next_w, config
+
+
+def adam(w, dw, config=None):
+    """
+    Uses the Adam update rule, which incorporates moving averages of both the
+    gradient and its square and a bias correction term.
+
+    config format:
+    - learning_rate: Scalar learning rate.
+    - beta1: Decay rate for moving average of first moment of gradient.
+    - beta2: Decay rate for moving average of second moment of gradient.
+    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
+    - m: Moving average of gradient.
+    - v: Moving average of squared gradient.
+    - t: Iteration number.
+    """
+    if config is None:
+        config = {}
+    config.setdefault("learning_rate", 1e-3)
+    config.setdefault("beta1", 0.9)
+    config.setdefault("beta2", 0.999)
+    config.setdefault("epsilon", 1e-8)
+    config.setdefault("m", np.zeros_like(w))
+    config.setdefault("v", np.zeros_like(w))
+    config.setdefault("t", 0)
+
+    next_w = None
+    ###########################################################################
+    # TODO: Implement the Adam update formula, storing the next value of w in #
+    # the next_w variable. Don't forget to update the m, v, and t variables   #
+    # stored in config.                                                       #
+    #                                                                         #
+    # NOTE: In order to match the reference output, please modify t _before_  #
+    # using it in any calculations.                                           #
+    ###########################################################################
+    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
+
+    pass
+
+    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
+    ###########################################################################
+    #                             END OF YOUR CODE                            #
+    ###########################################################################
+
+    return next_w, config
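
Note on the rmsprop stub: this commit only implements sgd_momentum and leaves the RMSProp TODO as a pass. For reference, below is a minimal sketch of the standard RMSProp rule that the docstring describes, keyed to the same config hyperparameters. The name rmsprop_sketch is hypothetical and none of this code is part of the patch; it is not verified against the assignment's reference output.

import numpy as np


def rmsprop_sketch(w, dw, config=None):
    """Hypothetical completion of the rmsprop stub above; not part of this patch."""
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("decay_rate", 0.99)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("cache", np.zeros_like(w))

    # Exponentially decayed moving average of squared gradients.
    config["cache"] = (
        config["decay_rate"] * config["cache"]
        + (1 - config["decay_rate"]) * dw * dw
    )
    # Per-parameter step scaled by the root of the cache; epsilon avoids division by zero.
    next_w = w - config["learning_rate"] * dw / (np.sqrt(config["cache"]) + config["epsilon"])

    return next_w, config

Placing epsilon outside the square root is one common convention; the assignment's expected placement should be checked against its gradient checks.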
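
Similarly, the adam stub is left as a TODO. Below is a minimal sketch of the standard Adam rule the docstring describes, incrementing t before the bias correction as the NOTE in the patch asks. The name adam_sketch is hypothetical and this code is not part of the patch.

import numpy as np


def adam_sketch(w, dw, config=None):
    """Hypothetical completion of the adam stub above; not part of this patch."""
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-3)
    config.setdefault("beta1", 0.9)
    config.setdefault("beta2", 0.999)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("m", np.zeros_like(w))
    config.setdefault("v", np.zeros_like(w))
    config.setdefault("t", 0)

    # Update t before using it, as the NOTE in the TODO block requires.
    config["t"] += 1
    t = config["t"]

    # Moving averages of the gradient and of its elementwise square.
    config["m"] = config["beta1"] * config["m"] + (1 - config["beta1"]) * dw
    config["v"] = config["beta2"] * config["v"] + (1 - config["beta2"]) * dw * dw

    # Bias-corrected first and second moment estimates.
    m_hat = config["m"] / (1 - config["beta1"] ** t)
    v_hat = config["v"] / (1 - config["beta2"] ** t)

    next_w = w - config["learning_rate"] * m_hat / (np.sqrt(v_hat) + config["epsilon"])

    return next_w, config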