import numpy as np
import matplotlib.pyplot as plt
from training_model import TrainingModel
# set to a value > 0 to check the analytic gradient against divided
# differences during training (slow; for debugging only)
DEBUG_GRAD = 0
class NeuralNetwork(TrainingModel):
    '''
    basic feed-forward neural network
    '''
    def __init__(self, layers, X, y, learning_rate=1, weight_decay=0, activation_fn=TrainingModel.sigmoid, number_of_epochs=10, plot_cost_graph=False, batch_size=100):
        '''
        inputs:
        layers           - A list containing the number of nodes in each
                           layer.
        X                - Inputs for the neural network. A numpy array of
                           shape (#examples X #features). Note: #features
                           must equal the value at the first index of
                           layers.
        y                - Outputs for the neural network. A numpy array of
                           shape (#examples X #outputs). Note 1: #outputs
                           must equal the value at the last index of
                           layers. Note 2: the number of rows in X must
                           match the number of rows in y.
        learning_rate    - A floating point number. How aggressively the
                           weights of the neural network are updated.
        weight_decay     - A floating point number. The L2 regularization
                           coefficient.
        activation_fn    - One of the functions defined in TrainingModel.
                           Options are:
                           * sigmoid
                           * tanh
                           * relu
        number_of_epochs - Integer. Number of passes over the training set.
        plot_cost_graph  - A boolean deciding whether the cost should be
                           plotted during training; useful for debugging.
        batch_size       - Integer. The number of examples per mini-batch.
                           Smaller batches update the weights more often
                           but make each epoch slower; larger batches use
                           more RAM per step. This should be between 1 and
                           the number of examples.
        '''
        # validate inputs
        if X.shape[0] != y.shape[0]:
            raise ValueError("number of examples in input does not match number of examples in output!")
self.plot_cost_graph = plot_cost_graph
self.layers = layers
self.number_of_epochs = number_of_epochs
self.learning_rate = learning_rate
self.weight_decay = weight_decay
self.X = X
self.y = y
        # weights is a list of matrices. Each matrix has shape (layer X next_layer)
        self.weights = []
        # biases is a list of arrays, each of shape (1 X next_layer)
        self.biases = []
        # initialize the weights, placing them in a list of np arrays
        for layer, next_layer in zip(layers, layers[1:]):
            # use Xavier initialization: scale by 1/sqrt(fan_in)
            self.weights.append(np.random.randn(layer, next_layer) / np.sqrt(layer))
            # one bias row vector for every layer except the input layer
            self.biases.append(np.random.randn(next_layer).reshape(1, next_layer))
        # gradient of the cost w.r.t. the weights; filled in by back_prop
self.grad = []
        # map of activation functions to their derivatives
derivative_dict = {self.tanh: self.tanh_prime, self.sigmoid: self.sigmoid_prime, self.relu: self.relu_prime}
self.activation_fn = activation_fn
self.activation_fn_prime = derivative_dict[activation_fn]
self.batch_size = batch_size
        # round up so that a final partial batch is not dropped
        self.number_of_batches = int(np.ceil(self.X.shape[0] / self.batch_size))
@staticmethod
def foward_feed(x, weights, biases, activation_fn):
        '''
        given starting values in x, calculate the value of each neuron in
        every layer of the neural network.
        returns the pre-activations (z) by layer, the activations (a) by
        layer, and the network output y_approx
        '''
        # a holds the activations of the current layer. We start with the input x
        pre_activations = []
        activations = []
        a = x
        activations.append(a)
        for weight, bias in zip(weights, biases):
            z = np.matmul(a, weight) + bias
            a = activation_fn(z)
            pre_activations.append(z)
            activations.append(a)
        # replace the final a with softmax(z) so the output values form a
        # probability distribution
        y_approx = TrainingModel.softmax(z)
        # update the last layer
        activations[-1] = y_approx
        return pre_activations, activations, y_approx
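    # A minimal shape walkthrough of foward_feed (hypothetical sizes chosen
    # for illustration, not taken from the code): with layers = [2, 3, 2]
    # and a batch of 4 examples,
    #   x: (4, 2) @ weights[0]: (2, 3) + biases[0]: (1, 3) -> a: (4, 3)
    #   a: (4, 3) @ weights[1]: (3, 2) + biases[1]: (1, 2) -> z: (4, 2)
    # and y_approx = softmax(z) also has shape (4, 2).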
def cost(self):
        '''
        compute the cost over the full training set
        '''
        # TODO: the cost is currently calculated assuming a sigmoid/softmax
        # output with the cross entropy cost. This should be more flexible
# number of examples
m = self.X.shape[0]
reg_term = 0
for weight in self.weights:
reg_term += np.sum(np.power(weight, 2))
_, _, y_approx = self.foward_feed(self.X, self.weights, self.biases, self.activation_fn)
# quadratic cost function
#cost = 1/m * np.sum(1/2 * np.power(y_approx - self.y, 2)) + self.weight_decay / 2 * reg_term
# cross entropy cost function
cost = - 1/m * np.sum(self.y * np.log(y_approx) + (1 - self.y) * np.log(1 - y_approx)) + self.weight_decay/(2*m) * reg_term
return cost
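    # For reference, the cross entropy cost computed above is (in standard
    # notation, restating the code rather than adding to it):
    #   J = -1/m * sum_i [ y_i*ln(y_approx_i) + (1 - y_i)*ln(1 - y_approx_i) ]
    #       + weight_decay/(2m) * sum_l ||W_l||^2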
def grad_check(self):
'''
calculate the gradient using divided differences. This is to make sure
the gradient we are using is sane
'''
m = self.X.shape[0]
        epsilon = .0001
grads_approx = []
low_weights = [np.copy(w) for w in self.weights]
high_weights = [np.copy(w) for w in self.weights]
for k, weight in enumerate(self.weights):
grad_approx = np.zeros(weight.shape)
for i in range(weight.shape[0]):
for j in range(weight.shape[1]):
                    low_weights[k][i, j] -= epsilon
                    high_weights[k][i, j] += epsilon
low_reg_term = 0
for low_weight in low_weights:
low_reg_term += np.sum(np.power(low_weight, 2))
_, _, low_approx = self.foward_feed(self.X, low_weights, self.biases, self.activation_fn)
high_reg_term = 0
for high_weight in high_weights:
high_reg_term += np.sum(np.power(high_weight, 2))
_, _, high_approx = self.foward_feed(self.X, high_weights, self.biases, self.activation_fn)
# TODO: add weight decay into these calculations
low_cost = - np.sum(self.y * np.log(low_approx) + (1 - self.y) * np.log(1 - low_approx)) #+ self.weight_decay / 2 * low_reg_term
high_cost = - np.sum(self.y * np.log(high_approx) + (1 - self.y) * np.log(1 - high_approx)) #+ self.weight_decay / 2 * high_reg_term
                    grad_approx[i, j] = (high_cost - low_cost) / (2 * epsilon)
                    # re-adjust values for the next iteration
                    low_weights[k][i, j] += epsilon
                    high_weights[k][i, j] -= epsilon
grads_approx.append(grad_approx)
return grads_approx
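    # The approximation used above is the standard central difference
    # (stated here for reference): for each weight w,
    #   dJ/dw ~= (J(w + epsilon) - J(w - epsilon)) / (2 * epsilon)
    # which agrees with the analytic gradient up to O(epsilon^2).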
def back_prop(self, x, y):
        '''
        perform backpropagation to compute the gradients of the cost with
        respect to the weights and biases. The weights themselves are
        updated in train_model.
        x - numpy array of shape (batch_size X #features); a batch of
            inputs to train with
        y - numpy array of shape (batch_size X #outputs); the batch of
            corresponding outputs to train with
        '''
# the list of deltas. There is one for every layer
delta = [None] * len(self.layers)
# the list of gradients. There is one for every weight
grad = [None] * (len(self.layers) - 1)
# the list of the bias gradients. There is one for every bias
bias_grad = [None] * (len(self.layers) - 1)
# get the set of activations a and outputs z.
#
# the first a = X. the subsequent a's are computed. There will be an a
# for every layer
#
        # the z's are computed from the a's. The last a is not used to compute
        # a z, however, so there is a z for every layer except the first one
        # (the first z belongs to the second layer).
z, a, _ = self.foward_feed(x, self.weights, self.biases, self.activation_fn)
        # pad the z's since there is no z for the first layer
z.insert(0, None)
# shorthand for clarity
W = self.weights
# number of layers - 1 since we are indexing starting at 0
n = len(self.layers) - 1
m = x.shape[0]
        # delta for the quadratic error function
        #delta[n] = (a[n] - y) * self.activation_fn_prime(z[n])
        # delta for the cross entropy cost function
        delta[n] = 1/m * (a[n] - y)
for l in range(n - 1, 0, -1):
            delta[l] = np.matmul(delta[l + 1], W[l].T) * self.activation_fn_prime(z[l])
for l in range(n - 1, -1, -1):
grad[l] = np.matmul(a[l].T, delta[l + 1])
bias_grad[l] = np.sum(delta[l + 1], axis=0)
self.grad = grad
self.bias_grad = bias_grad
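    # Summary of the recurrences implemented above (standard backpropagation
    # with a softmax/cross entropy output layer):
    #   delta[n] = 1/m * (a[n] - y)                             output layer
    #   delta[l] = (delta[l+1] @ W[l].T) * activation_fn_prime(z[l])
    #   grad[l] = a[l].T @ delta[l+1]
    #   bias_grad[l] = sum of delta[l+1] over the batch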
    def train_model(self):
        '''
        train the network with mini-batch gradient descent
        '''
m = self.X.shape[0]
for epoch in range(self.number_of_epochs):
start_of_batch = 0
            if DEBUG_GRAD > 0:
                # sanity check: numerically approximate the gradient so it
                # can be compared against back_prop's result below
                grads_approx = self.grad_check()
for i in range(self.number_of_batches):
end_of_batch = min(start_of_batch + self.batch_size, self.X.shape[0])
self.back_prop(self.X[start_of_batch:end_of_batch,:], self.y[start_of_batch:end_of_batch,:])
start_of_batch = start_of_batch + self.batch_size
for j, _ in enumerate(self.weights):
if DEBUG_GRAD > 0:
print("\nself.grad")
print(self.grad[j])
print("\ngrads_approx")
print(grads_approx[j])
print("\ndifference")
print(self.grad[j] - grads_approx[j])
# TODO: check that the weight decay term is correct here
self.weights[j] = (1 - self.learning_rate * self.weight_decay / m) * self.weights[j] - self.learning_rate * self.grad[j]
                    self.biases[j] = self.biases[j] - self.learning_rate * self.bias_grad[j]
            if self.plot_cost_graph:
                # record roughly 100 points of the cost over training
                if epoch % np.ceil(self.number_of_epochs/100) == 0:
                    plt.plot(epoch, self.cost(), 'ro')
                # print the cost roughly 10 times over training
                if epoch % np.ceil(self.number_of_epochs/10) == 0:
                    print(self.cost())
if self.plot_cost_graph:
plt.show()
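
# A minimal usage sketch (an illustrative assumption, not part of the
# original module): train on a synthetic two-class problem with one-hot
# labels. The sizes, seed, and hyperparameters here are arbitrary.
if __name__ == "__main__":
    np.random.seed(0)
    m = 200
    # two input features; the class is determined by the sign of their sum
    X = np.random.randn(m, 2)
    labels = (X[:, 0] + X[:, 1] > 0).astype(int)
    # one-hot targets of shape (m, 2)
    y = np.eye(2)[labels]
    # 2 inputs -> 4 hidden nodes -> 2 outputs
    nn = NeuralNetwork([2, 4, 2], X, y, learning_rate=0.5,
                       number_of_epochs=50, batch_size=50)
    nn.train_model()
    print("final cost:", nn.cost())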