[feat] implement 'sgd_momentum' function #1
1 parent 301f4ec · commit cbd9798
Showing 1 changed file with 163 additions and 0 deletions.
@@ -0,0 +1,163 @@
import numpy as np


"""
This file implements various first-order update rules that are commonly used
for training neural networks. Each update rule accepts current weights and the
gradient of the loss with respect to those weights and produces the next set of
weights. Each update rule has the same interface:

def update(w, dw, config=None):

Inputs:
  - w: A numpy array giving the current weights.
  - dw: A numpy array of the same shape as w giving the gradient of the
    loss with respect to w.
  - config: A dictionary containing hyperparameter values such as learning
    rate, momentum, etc. If the update rule requires caching values over many
    iterations, then config will also hold these cached values.

Returns:
  - next_w: The next point after the update.
  - config: The config dictionary to be passed to the next iteration of the
    update rule.

NOTE: For most update rules, the default learning rate will probably not
perform well; however, the default values of the other hyperparameters should
work well for a variety of different problems.

For efficiency, update rules may perform in-place updates, mutating w and
setting next_w equal to w.
"""


def sgd(w, dw, config=None):
    """
    Performs vanilla stochastic gradient descent.

    config format:
    - learning_rate: Scalar learning rate.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)

    w -= config["learning_rate"] * dw
    return w, config
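

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original commit: shows how an update
# rule with the interface described at the top of this file is typically
# driven from a training loop. The toy quadratic loss and the helper name
# `_sgd_usage_sketch` are hypothetical.
# ---------------------------------------------------------------------------
def _sgd_usage_sketch():
    """Run sgd on a toy quadratic loss f(w) = ||w||^2 and return the result."""
    w = np.array([1.0, -2.0, 3.0])
    config = None
    for _ in range(200):
        dw = 2.0 * w  # gradient of ||w||^2
        w, config = sgd(w, dw, config)
    return w  # entries should shrink toward zero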


def sgd_momentum(w, dw, config=None):
    """
    Performs stochastic gradient descent with momentum.

    config format:
    - learning_rate: Scalar learning rate.
    - momentum: Scalar between 0 and 1 giving the momentum value.
      Setting momentum = 0 reduces to sgd.
    - velocity: A numpy array of the same shape as w and dw used to store a
      moving average of the gradients.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("momentum", 0.9)
    v = config.get("velocity", np.zeros_like(w))

    next_w = None
    ###########################################################################
    # TODO: Implement the momentum update formula. Store the updated value in #
    # the next_w variable. You should also use and update the velocity v.     #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Accumulate a velocity as a decaying sum of past gradients, then step
    # along the velocity.
    v = config["momentum"] * v - config["learning_rate"] * dw
    next_w = w + v

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    config["velocity"] = v

    return next_w, config
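

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original commit: a quick check of the
# docstring's claim that momentum = 0 reduces sgd_momentum to plain sgd. The
# toy arrays and the helper name `_sgd_momentum_sanity_sketch` are
# hypothetical.
# ---------------------------------------------------------------------------
def _sgd_momentum_sanity_sketch():
    w = np.linspace(-1.0, 1.0, 5)
    dw = np.ones_like(w)
    w_sgd, _ = sgd(w.copy(), dw, {"learning_rate": 1e-2})
    w_mom, _ = sgd_momentum(w.copy(), dw, {"learning_rate": 1e-2, "momentum": 0.0})
    assert np.allclose(w_sgd, w_mom)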


def rmsprop(w, dw, config=None):
    """
    Uses the RMSProp update rule, which uses a moving average of squared
    gradient values to set adaptive per-parameter learning rates.

    config format:
    - learning_rate: Scalar learning rate.
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
      gradient cache.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - cache: Moving average of second moments of gradients.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("decay_rate", 0.99)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("cache", np.zeros_like(w))

    next_w = None
    ###########################################################################
    # TODO: Implement the RMSprop update formula, storing the next value of w #
    # in the next_w variable. Don't forget to update cache value stored in    #
    # config['cache'].                                                        #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    pass

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################

    return next_w, config
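

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original commit: one way the rmsprop
# TODO above could be filled in, assuming the standard RMSProp formulation and
# the config keys documented in rmsprop's docstring. The helper name
# `_rmsprop_step_sketch` is hypothetical and this is not a graded solution.
# ---------------------------------------------------------------------------
def _rmsprop_step_sketch(w, dw, config):
    """One RMSProp step; expects config to already hold all keys set by rmsprop."""
    # Exponential moving average of squared gradients.
    cache = config["decay_rate"] * config["cache"] + (1 - config["decay_rate"]) * dw ** 2
    # Scale the step per parameter by the root of the averaged squared gradient.
    next_w = w - config["learning_rate"] * dw / (np.sqrt(cache) + config["epsilon"])
    config["cache"] = cache
    return next_w, config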


def adam(w, dw, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.

    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-3)
    config.setdefault("beta1", 0.9)
    config.setdefault("beta2", 0.999)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("m", np.zeros_like(w))
    config.setdefault("v", np.zeros_like(w))
    config.setdefault("t", 0)

    next_w = None
    ###########################################################################
    # TODO: Implement the Adam update formula, storing the next value of w in #
    # the next_w variable. Don't forget to update the m, v, and t variables   #
    # stored in config.                                                       #
    #                                                                         #
    # NOTE: In order to match the reference output, please modify t _before_  #
    # using it in any calculations.                                           #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    pass

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################

    return next_w, config
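

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original commit: one way the adam TODO
# above could be filled in, assuming the standard bias-corrected Adam update
# and the config keys documented in adam's docstring; t is incremented before
# use, as the NOTE in the code asks. The helper name `_adam_step_sketch` is
# hypothetical and this is not a graded solution.
# ---------------------------------------------------------------------------
def _adam_step_sketch(w, dw, config):
    """One Adam step; expects config to already hold all keys set by adam."""
    config["t"] += 1
    # First and second moment estimates (moving averages of dw and dw**2).
    config["m"] = config["beta1"] * config["m"] + (1 - config["beta1"]) * dw
    config["v"] = config["beta2"] * config["v"] + (1 - config["beta2"]) * dw ** 2
    # Bias correction compensates for the zero initialization of m and v.
    m_hat = config["m"] / (1 - config["beta1"] ** config["t"])
    v_hat = config["v"] / (1 - config["beta2"] ** config["t"])
    next_w = w - config["learning_rate"] * m_hat / (np.sqrt(v_hat) + config["epsilon"])
    return next_w, config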