# fnn.py (forked from crazydonkey200/minimal-nn)
import numpy as np
######################################
# Feedforward Neural network (FNN)
######################################
class FNN(object):
def __init__(self, input_dim, output_dim, sizes, activ_funcs):
"""Feedforward Neural network for multi-class classification.
The object holds a list of layer objects, each one
implements a layer in the network, the specification
of each layer is decided by input_dim, output_dim,
sizes and activ_funcs. Note that an output layer
(linear) and loss function (softmax and
cross-entropy) would be automatically added to the
FNN.
Input:
input_dim: dimension of input.
output_dim: dimension of output (number of labels).
sizes: a list of integers specifying the number of
hidden units on each layer.
activ_funcs: a list of function objects specifying
the activation function of each layer.
"""
# Last layer is linear and loss is mean_cross_entropy_softmax
self.sizes = [input_dim] + sizes[:] + [output_dim]
self.activ_funcs = activ_funcs[:] + [linear]
self.shapes = []
        for i in range(len(self.sizes) - 1):
self.shapes.append((self.sizes[i], self.sizes[i+1]))
self.layers = []
for i, shape in enumerate(self.shapes):
self.layers.append(Layer(shape, self.activ_funcs[i]))
def forwardprop(self, data, labels=None):
"""Forward propagate the activations through the network.
Iteratively propagate the activations (starting from
input data) through each layer, and output a
probability distribution among labels (probs), and
if labels are given, also compute the loss.
"""
inputs = data
for layer in self.layers:
outputs = layer.forward(inputs)
inputs = outputs
probs = softmax(outputs)
if labels is not None:
return probs, self.loss(outputs, labels)
else:
return probs, None
def backprop(self, labels):
"""Backward propagate the gradients/derivatives through the network.
Iteratively propagate the gradients/derivatives (starting from
outputs) through each layer, and save gradients/derivatives of
each parameter (weights or bias) in the layer.
"""
d_outputs = self.d_loss(self.layers[-1].a, labels)
for layer in self.layers[::-1]:
d_inputs = layer.backward(d_outputs)
d_outputs = d_inputs
def loss(self, outputs, labels):
"Compute the cross entropy softmax loss."
return mean_cross_entropy_softmax(outputs, labels)
def d_loss(self, outputs, labels):
"Compute derivatives of the cross entropy softmax loss w.r.t the outputs."
return d_mean_cross_entropy_softmax(outputs, labels)
def predict(self, data):
"Predict the labels of the data."
probs, _ = self.forwardprop(data)
return np.argmax(probs, axis=1)


class Layer(object):
def __init__(self, shape, activ_func):
"Implements a layer of a NN."
self.w = np.random.uniform(-np.sqrt(2.0 / shape[0]),
np.sqrt(2.0 / shape[0]),
size=shape)
self.b = np.zeros((1, shape[1]))
# The activation function, for example, RELU, tanh
# or sigmoid.
self.activate = activ_func
# The derivative of the activation function.
self.d_activate = GRAD_DICT[activ_func]
def forward(self, inputs):
"""Forward propagate the activation through the layer.
Given the inputs (activation of previous layers),
compute and save the activation of current layer,
then return it as output.
"""
###################################
# Question 1
# Instructions
        # Use the linear and non-linear transformations to
        # compute the activation and cache it in the field self.a.
        # Functions you will use:
        # np.dot: numpy function to compute the product of two matrices.
        # self.activate: the activation function of this layer;
        # it takes in a matrix of scores (the linear transformation)
        # and computes the activations (the non-linear transformation).
        # (plus the common arithmetic functions).
        # For all the numpy functions, use Google and the numpy manual for
        # more details and examples.
        # Object fields you will use:
        # self.w:
        # weight matrix, a matrix with shape (H_-1, H).
        # H_-1 is the number of hidden units in the previous layer.
        # H is the number of hidden units in this layer.
        # self.b: bias, a matrix/vector with shape (1, H).
        # self.activate: the activation function of this layer.
        # Input:
        # inputs:
        # a matrix with shape (N, H_-1).
        # N is the number of data points.
        # H_-1 is the number of hidden units in the previous layer.
# Code you need to fill in: 2 lines.
#########################################################
# Modify the right hand side of the following code.
# The linear transformation.
# scores:
# weighted sum of inputs plus bias, a matrix of shape (N, H).
# N is the number of data points.
# H is the number of hidden units in this layer.
scores = np.zeros((inputs.shape[0], self.w.shape[1]))
# The non-linear transformation.
# outputs:
# activations of this layer, a matrix of shape (N, H).
# N is the number of data points.
# H is the number of hidden units in this layer.
activations = np.zeros_like(scores)
# End of the code to modify
#########################################################
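        # A possible completion of Question 1 (a sketch, not necessarily
        # the intended solution): apply the linear transformation and
        # then the layer's activation function, e.g.
        #     scores = np.dot(inputs, self.w) + self.b
        #     activations = self.activate(scores)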
        # Cache the inputs; they will be used by the backward
        # function during backprop.
self.inputs = inputs
# Cache the activations, to be used by backprop.
self.a = activations
outputs = activations
return outputs
def backward(self, d_outputs):
"""Backward propagate the gradient through this layer.
Given the gradient w.r.t the output of this layer
(d_outputs), compute and save the gradient w.r.t the
weights (d_w) and bias (d_b) of this layer and
return the gradient w.r.t the inputs (d_inputs).
"""
###################################
# Question 2
# Instructions
        # Compute the derivatives of the loss w.r.t the weights and bias,
        # given the derivatives of the loss w.r.t the outputs of this layer,
        # using the chain rule.
        # Naming convention: use d_var to store the
        # derivative of the loss w.r.t the variable.
        # Functions you will use:
        # np.dot (numpy.dot): numpy function to compute the product of two matrices.
        # np.mean or np.sum (numpy.mean or numpy.sum):
        # numpy functions to compute the mean or sum of a matrix;
        # use the keyword argument 'axis' to compute the mean
        # or sum along a particular axis, and you might also
        # find the 'keepdims' argument useful.
        # self.d_activate:
        # given the current activation (self.a) as input,
        # compute the derivative of the activation function.
        # See d_relu as an example.
        # (plus the common arithmetic functions).
        # np.transpose or m.T (m is a numpy array): transpose a matrix.
        # For all the numpy functions, use Google and the numpy manual for
        # more details and examples.
        # Object fields you will use:
        # self.w: weight matrix, a matrix with shape (H_-1, H).
        # H_-1 is the number of hidden units in the previous layer.
        # H is the number of hidden units in this layer.
        # self.d_activate: computes the derivative of the activation function.
        # See d_relu as an example.
        # d_outputs: the derivative of the loss w.r.t the outputs of
        # this layer, a matrix of shape (N, H). N is the number of
        # data points and H is the number of hidden units in this layer.
        # self.inputs: inputs to this layer, a matrix with shape (N, H_-1).
        # N is the number of data points.
        # H_-1 is the number of hidden units in the previous layer.
        # self.a: activations of the hidden units of this layer, a matrix
        # with shape (N, H).
        # N is the number of data points.
        # H is the number of hidden units in this layer.
# Code you need to write: 4 lines.
###################################
# Modify the right hand side of the following code.
# d_scores:
        # Derivatives of the loss w.r.t the scores (the result of the linear transformation).
# A matrix of shape (N, H)
# N is the number of data points.
# H is the number of hidden units in this layer.
d_scores = np.zeros_like(self.a)
# self.d_b:
# Derivatives of the loss w.r.t the bias, averaged over all data points.
# A matrix of shape (1, H)
# H is the number of hidden units in this layer.
self.d_b = np.zeros_like(self.b)
# self.d_w:
# Derivatives of the loss w.r.t the weight matrix, averaged over all data points.
# A matrix of shape (H_-1, H)
        # H_-1 is the number of hidden units in the previous layer.
# H is the number of hidden units in this layer.
self.d_w = np.zeros_like(self.w)
# d_inputs:
# Derivatives of the loss w.r.t the previous layer's activations/outputs.
# A matrix of shape (N, H_-1)
# N is the number of data points.
# H_-1 is the number of hidden units in the previous layer.
d_inputs = np.zeros([d_scores.shape[0], self.w.shape[0]])
# End of the code to modify
###################################
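        # A possible completion of Question 2 (a sketch, not necessarily
        # the intended solution), applying the chain rule and averaging
        # the parameter gradients over the N data points, e.g.
        #     d_scores = d_outputs * self.d_activate(a=self.a)
        #     self.d_b = np.mean(d_scores, axis=0, keepdims=True)
        #     self.d_w = np.dot(self.inputs.T, d_scores) / d_scores.shape[0]
        #     d_inputs = np.dot(d_scores, self.w.T)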
return d_inputs


class GradientDescentOptimizer(object):
def __init__(self, learning_rate, decay_steps=1000,
decay_rate=1.0):
"Gradient descent with staircase exponential decay."
self.learning_rate = learning_rate
self.steps = 0.0
self.decay_steps = decay_steps
self.decay_rate = decay_rate
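        # For example, with decay_steps=1000 and decay_rate=0.5 the
        # learning rate is halved roughly every 1000 updates; the
        # default decay_rate of 1.0 keeps it constant.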
def update(self, model):
"Update model parameters."
for layer in model.layers:
layer.w -= layer.d_w * self.learning_rate
layer.b -= layer.d_b * self.learning_rate
self.steps += 1
if (self.steps + 1) % self.decay_steps == 0:
self.learning_rate *= self.decay_rate


# Utility functions.
def relu(x):
"The rectified linear activation function."
return np.clip(x, 0.0, None)
def d_relu(a=None, x=None):
"Compute the derivative of RELU given activation (a) or input (x)."
if a is not None:
d = np.zeros_like(a)
d[np.where(a > 0.0)] = 1.0
return d
else:
return d_relu(a=relu(x))
def tanh(x):
"The tanh activation function."
return np.tanh(x)
def d_tanh(a=None, x=None):
"The derivative of the tanh function."
if a is not None:
return 1 - a ** 2
else:
return d_tanh(a=tanh(x))
def softmax(x):
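    "Compute the softmax probabilities row-wise in a numerically stable way."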
shifted_x = x - np.max(x, axis=1, keepdims=True)
f = np.exp(shifted_x)
p = f / np.sum(f, axis=1, keepdims=True)
return p
def mean_cross_entropy(outputs, labels):
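    "Compute the mean cross entropy between predicted probabilities (outputs) and one-hot labels."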
n = labels.shape[0]
return - np.sum(labels * np.log(outputs)) / n
def mean_cross_entropy_softmax(logits, labels):
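    "Compute the mean cross entropy of softmax(logits) against one-hot labels."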
return mean_cross_entropy(softmax(logits), labels)
def d_mean_cross_entropy_softmax(logits, labels):
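    "Compute the derivative of the cross entropy softmax loss w.r.t the logits."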
return softmax(logits) - labels
def linear(x):
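    "The identity (linear) activation function."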
return x
def d_linear(a=None, x=None):
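    "The derivative of the linear activation function (constant 1)."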
return 1.0
# Mapping from activation functions to their derivatives.
GRAD_DICT = {relu: d_relu, tanh: d_tanh, linear: d_linear}
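

# -------------------------------------------------------------------
# A minimal usage sketch (added for illustration; it assumes the
# Question 1 and Question 2 blanks above have been completed, and the
# data, layer sizes and hyperparameters below are arbitrary): build a
# small FNN on random one-hot labelled data and run a few gradient
# descent steps.
if __name__ == "__main__":
    np.random.seed(0)
    n, input_dim, output_dim = 64, 10, 3
    data = np.random.randn(n, input_dim)
    labels = np.eye(output_dim)[np.random.randint(output_dim, size=n)]

    model = FNN(input_dim, output_dim, sizes=[32, 32],
                activ_funcs=[relu, relu])
    optimizer = GradientDescentOptimizer(learning_rate=0.1)

    for step in range(200):
        # Forward pass computes the probabilities and the mean loss.
        probs, loss = model.forwardprop(data, labels)
        # Backward pass stores d_w and d_b in each layer.
        model.backprop(labels)
        # Gradient descent update of all weights and biases.
        optimizer.update(model)
        if step % 50 == 0:
            print("step %d, loss %.4f" % (step, loss))

    accuracy = np.mean(model.predict(data) == np.argmax(labels, axis=1))
    print("training accuracy: %.2f" % accuracy)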