# coding: utf-8
import sys
import time
import numpy as np
import itertools
import pandas as pd
from utils import *
from rnnmath import *
from sys import stdout
class RNN(object):
'''
This class implements Recurrent Neural Networks.
The main functions in this class are as below:
predict -> predict an output sequence for a given input sequence
acc_deltas -> accumulate weight updates for the RNN's weight matrices, standard back propagation
acc_deltas_bptt -> accumulate weight updates for the RNN's weight matrices, using back propagation through time
acc_deltas_np -> accumulate weight updates for the RNN's weight matrices, standard back propagation -- for number prediction
acc_deltas_bptt_np -> accumulate weight updates for the RNN's weight matrices, using back propagation through time -- for number prediction
compute_loss -> compute the (cross entropy) loss between the desired output and predicted output for a given input sequence
compute_mean_loss -> compute the average loss over all sequences in a corpus
generate_sequence -> use the RNN to generate a new (unseen) sequence
'''
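# A minimal usage sketch (toy indices and illustrative hyperparameters, not
# taken from any particular experiment):
#   rnn = RNN(vocab_size=2000, hidden_dims=50, out_vocab_size=2000)
#   y, s = rnn.predict([0, 4, 2])               # y: (3, 2000), s: (4, 50)
#   rnn.acc_deltas([0, 4, 2], [4, 2, 3], y, s)  # accumulate gradients
#   rnn.apply_deltas(learning_rate=0.5)         # apply and reset them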
def __init__(self, vocab_size, hidden_dims, out_vocab_size):
'''
initialize the RNN with random weight matrices.
vocab_size size of vocabulary that is being used
hidden_dims number of hidden units
out_vocab_size size of the output vocabulary
'''
self.vocab_size = vocab_size
self.hidden_dims = hidden_dims
self.out_vocab_size = out_vocab_size
# matrices V (input -> hidden), W (hidden -> output), U (hidden -> hidden)
self.U = np.random.randn(self.hidden_dims, self.hidden_dims)*np.sqrt(0.1)
self.V = np.random.randn(self.hidden_dims, self.vocab_size)*np.sqrt(0.1)
self.W = np.random.randn(self.out_vocab_size, self.hidden_dims)*np.sqrt(0.1)
# matrices to accumulate weight updates
self.deltaU = np.zeros((self.hidden_dims, self.hidden_dims))
self.deltaV = np.zeros((self.hidden_dims, self.vocab_size))
self.deltaW = np.zeros((self.out_vocab_size, self.hidden_dims))
self.delta_in = {}
def apply_deltas(self, learning_rate):
'''
update the RNN's weight matrices with corrections accumulated over some training instances
learning_rate scaling factor for update weights
'''
# apply updates to U, V, W
self.U += learning_rate*self.deltaU
self.W += learning_rate*self.deltaW
self.V += learning_rate*self.deltaV
# reset matrices
self.deltaU.fill(0.0)
self.deltaV.fill(0.0)
self.deltaW.fill(0.0)
def predict(self, x):
'''
predict an output sequence y for a given input sequence x
x list of words, as indices, e.g.: [0, 4, 2]
returns y,s
y matrix of probability vectors for each input word
s matrix of hidden layers for each input word
'''
# matrix s holds the hidden states, y the output distributions, for input x.
# rows correspond to time steps t, i.e., input words
# s has one extra row: at t=0 the recurrence reads s[t-1] == s[-1], which stays all-zero
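# Per time step t the loop below computes (x_t is the one-hot input)
#   s[t] = sigmoid(V x_t + U s[t-1])
#   y[t] = softmax(W s[t])
# so for an input of length T, y has shape (T, out_vocab_size) and s has
# shape (T+1, hidden_dims).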
s = np.zeros((len(x) + 1, self.hidden_dims))
y = np.zeros((len(x), self.out_vocab_size))
for t in range(len(x)):
x_onehot = make_onehot(x[t], self.vocab_size)
net_in = np.dot(self.V, x_onehot) + np.dot(self.U, s[t-1])
s[t] = sigmoid(net_in)
net_out = self.W.dot(s[t])
y[t] = softmax(net_out)
return y, s
def acc_deltas(self, x, d, y, s):
'''
accumulate updates for V, W, U
standard back propagation
V, W, U are not updated directly; deltaV, deltaW, deltaU accumulate the updates over the sequence instead
x list of words, as indices, e.g.: [0, 4, 2]
d list of words, as indices, e.g.: [4, 2, 3]
y predicted output layer for x; list of probability vectors, e.g., [[0.3, 0.1, 0.1, 0.5], [0.2, 0.7, 0.05, 0.05] [...]]
should be part of the return value of predict(x)
s predicted hidden layer for x; list of vectors, e.g., [[1.2, -2.3, 5.3, 1.0], [-2.1, -1.1, 0.2, 4.2], [...]]
should be part of the return value of predict(x)
no return values
'''
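# The loop below uses the standard gradients for a softmax output with
# cross-entropy loss and a sigmoid hidden layer, per time step t:
#   delta_out[t] = d[t] - y[t]                     (one-hot target minus prediction)
#   deltaW      += delta_out[t] s[t]^T
#   delta_in[t]  = (W^T delta_out[t]) * s[t] * (1 - s[t])
#   deltaV      += delta_in[t] x[t]^T
#   deltaU      += delta_in[t] s[t-1]^T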
for t in reversed(range(len(x))):
d_vector = make_onehot(d[t], self.out_vocab_size)[:, None]
delta_out = (d_vector - y[t][:, None])
self.deltaW += delta_out.dot(s[t][None, :])
sigmoid_grad = (1 - s[t][:, None]) * s[t][:, None]
self.delta_in[t] = self.W.T.dot(delta_out) * sigmoid_grad
x_vector = make_onehot(x[t], self.vocab_size)
self.deltaV += self.delta_in[t].dot(x_vector[None, :])
self.deltaU += self.delta_in[t].dot(s[t-1][None, :])
def acc_deltas_np(self, x, d, y, s):
'''
accumulate updates for V, W, U
standard back propagation
again, V, W, U are not updated directly; deltaV, deltaW, deltaU accumulate the updates instead
for the number prediction task the output is a binary prediction, 0 or 1
x list of words, as indices, e.g.: [0, 4, 2]
d array with one element, as indices, e.g.: [0] or [1]
y predicted output layer for x; list of probability vectors, e.g., [[0.3, 0.1, 0.1, 0.5], [0.2, 0.7, 0.05, 0.05] [...]]
should be part of the return value of predict(x)
s predicted hidden layer for x; list of vectors, e.g., [[1.2, -2.3, 5.3, 1.0], [-2.1, -1.1, 0.2, 4.2], [...]]
should be part of the return value of predict(x)
no return values
'''
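# For the number prediction task the label is a single class, so only the
# final time step t = len(x) - 1 produces an output error; the computation
# below mirrors acc_deltas restricted to that step.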
t = len(x) - 1
d_vector = make_onehot(d[0], self.out_vocab_size)[:, None]
delta_out = (d_vector - y[t][:, None])
self.deltaW += delta_out.dot(s[t][None, :])
sigmoid_grad = (1 - s[t][:, None]) * s[t][:, None]
self.delta_in[t] = self.W.T.dot(delta_out) * sigmoid_grad
x_vector = make_onehot(x[t], self.vocab_size)
self.deltaV += self.delta_in[t].dot(x_vector[None, :])
self.deltaU += self.delta_in[t].dot(s[t - 1][None, :])
def acc_deltas_bptt(self, x, d, y, s, steps):
'''
accumulate updates for V, W, U
back propagation through time (BPTT)
x list of words, as indices, e.g.: [0, 4, 2]
d list of words, as indices, e.g.: [4, 2, 3]
y predicted output layer for x; list of probability vectors, e.g., [[0.3, 0.1, 0.1, 0.5], [0.2, 0.7, 0.05, 0.05] [...]]
should be part of the return value of predict(x)
s predicted hidden layer for x; list of vectors, e.g., [[1.2, -2.3, 5.3, 1.0], [-2.1, -1.1, 0.2, 4.2], [...]]
should be part of the return value of predict(x)
steps number of time steps to go back in BPTT
no return values
'''
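# BPTT propagates the hidden-layer error back up to `steps` extra time steps.
# Starting from delta_in at step t, the inner loop below applies the recursion
#   delta_in(t-tao-1) = (U^T delta_in(t-tao)) * s[t-tao-1] * (1 - s[t-tao-1])
# and lets each delta_in(t-tao) contribute to deltaV (via x[t-tao]) and to
# deltaU (via s[t-tao-1]).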
for t in reversed(range(len(x))):
d_vector = make_onehot(d[t], self.out_vocab_size)[:, None]
delta_out = (d_vector - y[t][:, None])
self.deltaW += delta_out.dot(s[t][None, :])
sigmoid_grad = (1 - s[t][:, None]) * s[t][:, None]
delta_in = self.W.T.dot(delta_out) * sigmoid_grad
for tao in range(0, min(t+1, steps+1)):
sigmoid_grad = (1 - s[t-tao][:, None]) * s[t-tao][:, None]
x_vector = make_onehot(x[t-tao], self.vocab_size)
self.deltaV += delta_in.dot(x_vector[None, :])
self.deltaU += delta_in.dot(s[t-tao-1][None, :])
delta_in = np.dot(self.U.T, delta_in) * ((1 - s[t-tao-1][:, None]) * s[t-tao-1][:, None])
def acc_deltas_bptt_np(self, x, d, y, s, steps):
'''
accumulate updates for V, W, U
back propagation through time (BPTT)
x list of words, as indices, e.g.: [0, 4, 2]
d array with one element, as indices, e.g.: [0] or [1]
y predicted output layer for x; list of probability vectors, e.g., [[0.3, 0.1, 0.1, 0.5], [0.2, 0.7, 0.05, 0.05] [...]]
should be part of the return value of predict(x)
s predicted hidden layer for x; list of vectors, e.g., [[1.2, -2.3, 5.3, 1.0], [-2.1, -1.1, 0.2, 4.2], [...]]
should be part of the return value of predict(x)
steps number of time steps to go back in BPTT
no return values
'''
t = len(x) - 1
d_vector = make_onehot(d[0], self.out_vocab_size)[:, None]
delta_out = d_vector - y[t][:, None]
self.deltaW += delta_out.dot(s[t][None, :])
sigmoid_grad = (1 - s[t][:, None]) * s[t][:, None]
delta_in = self.W.T.dot(delta_out) * sigmoid_grad
for tao in range(min(t+1, steps+1)):
sigmoid_grad = (1 - s[t-tao][:, None]) * s[t-tao][:, None]
x_vector = make_onehot(x[t-tao], self.vocab_size)
self.deltaV += delta_in.dot(x_vector[None, :])
self.deltaU += delta_in.dot(s[t-tao-1][None, :])
delta_in = np.dot(self.U.T, delta_in) * ((1 - s[t-tao-1][:, None]) * s[t-tao-1][:, None])
def compute_loss(self, x, d):
'''
compute the loss between predictions y for x, and desired output d.
first predicts the output for x using the RNN, then computes the loss w.r.t. d
x list of words, as indices, e.g.: [0, 4, 2]
d list of words, as indices, e.g.: [4, 2, 3]
return loss the combined loss for all words
'''
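# Illustrative example (made-up probabilities): if the model assigns the
# correct next word probability 0.5, 0.7 and 0.1 at the three time steps,
# the summed loss is -(ln 0.5 + ln 0.7 + ln 0.1) ~= 3.35.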
loss = 0.
y = self.predict(x)[0]
for i in range(len(d)):
loss += -np.log(y[i][d[i]])
return loss
def compute_loss_np(self, x, d):
'''
compute the loss between predictions y for x, and desired output d.
first predicts the output for x using the RNN, then computes the loss w.r.t. d
x list of words, as indices, e.g.: [0, 4, 2]
d a word, as indices, e.g.: [0]
return loss we only take the prediction from the last time step
'''
y = self.predict(x)[0]
loss = -np.log(y[-1][d[0]])
return loss
def compute_acc_np(self, x, d):
'''
compute the prediction accuracy: the prediction at the last time step compared to the desired output d.
first predicts the output for x using the RNN, then compares argmax of the last prediction with d
x list of words, as indices, e.g.: [0, 4, 2]
d a word class (plural/singular), as index, e.g.: [0] or [1]
return 1 if argmax(y[t]) == d[0], 0 otherwise
'''
y = self.predict(x)[0]
if np.argmax(y[-1]) == d[0]:
return 1.0
else:
return 0.0
def compare_num_pred(self, x, d):
'''
compare the predicted probability of the desired output d[0] with that of its (re)inflected form d[1].
first predicts the output for x using the RNN, then compares the probabilities of d[0] and d[1] at the last time step.
x list of words, as indices, e.g.: [0, 4, 2]
d the desired verb and its (re)inflected form (singular/plural), as indices, e.g.: [7, 8]
return 1 if p(d[0]) > p(d[1]), 0 otherwise
'''
y = self.predict(x)[0]
if y[-1][d[0]] > y[-1][d[1]]:
return 1
else:
return 0
def compute_acc_lmnp(self, X_dev, D_dev):
'''
DO NOT CHANGE THIS
X_dev a list of input vectors, e.g., [[5, 4, 2], [7, 3, 8]]
D_dev a list of pair verb forms (plural/singular), e.g., [[4, 9], [6, 5]]
'''
acc = sum([self.compare_num_pred(X_dev[i], D_dev[i]) for i in range(len(X_dev))]) / len(X_dev)
return acc
def compute_mean_loss(self, X, D):
'''
compute the mean loss between predictions for corpus X and desired outputs in corpus D.
X corpus of sentences x1, x2, x3, [...], each a list of words as indices.
D corpus of desired outputs d1, d2, d3 [...], each a list of words as indices.
return mean_loss average loss over all words in D
'''
num_words = mean_loss = loss = 0.
for i in range(len(X)):
loss += self.compute_loss(X[i], D[i])
num_words += len(X[i])
mean_loss = loss / (num_words)
return mean_loss
def train(self, X, D, X_dev, D_dev, X_test, D_test, epochs=20, learning_rate=0.5, anneal=20, back_steps=0, batch_size=100, min_change=0.0001, log=True):
'''
train the RNN on some training set X, D while optimizing the loss on a dev set X_dev, D_dev
training stops after the first of the following is true:
* number of epochs reached
* minimum change observed for more than 2 consecutive epochs
X a list of input vectors, e.g., [[0, 4, 2], [1, 3, 0]]
D a list of desired outputs, e.g., [[4, 2, 3], [3, 0, 3]]
X_dev a list of input vectors, e.g., [[0, 4, 2], [1, 3, 0]]
D_dev a list of desired outputs, e.g., [[4, 2, 3], [3, 0, 3]]
X_test a list of input vectors for the held-out test set, same format as X
D_test a list of desired outputs for the held-out test set, same format as D
epochs maximum number of epochs (iterations) over the training set. default 20
learning_rate initial learning rate for training. default 0.5
anneal positive integer. if > 0, lowers the learning rate harmonically after each epoch.
higher annealing rate means less change per epoch.
anneal=0 will not change the learning rate over time.
default 20
back_steps positive integer. number of timesteps for BPTT. if back_steps < 2, standard BP will be used. default 0
batch_size number of training instances to use before updating the RNN's weight matrices.
if set to 1, weights will be updated after each instance. if set to len(X), weights are only updated after each epoch.
default 100
min_change minimum change in loss between 2 epochs. if the change in loss is smaller than min_change, training stops regardless of
number of epochs left.
default 0.0001
log whether or not to print out log messages. (default log=True)
'''
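# With anneal > 0 the schedule used below is equivalent to
#   learning_rate = a0 * anneal / (epoch + anneal)
# e.g. (illustrative values) a0 = 0.5, anneal = 20 gives 0.5 at epoch 0,
# 0.25 at epoch 20 and about 0.17 at epoch 40.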
if log:
stdout.write("\nTraining model for {0} epochs\ntraining set: {1} sentences (batch size {2})".format(epochs, len(X), batch_size))
stdout.write("\nOptimizing loss on {0} sentences".format(len(X_dev)))
stdout.write("\nVocab size: {0}\nHidden units: {1}".format(self.vocab_size, self.hidden_dims))
stdout.write("\nSteps for back propagation: {0}".format(back_steps))
stdout.write("\nInitial learning rate set to {0}, annealing set to {1}".format(learning_rate, anneal))
stdout.write("\n\ncalculating initial mean loss on dev set")
stdout.flush()
t_start = time.time()
loss_function = self.compute_loss
loss_sum = sum([len(d) for d in D_dev])
initial_loss = sum([loss_function(X_dev[i], D_dev[i]) for i in range(len(X_dev))]) / loss_sum
stdout.write(": {0}\n".format(initial_loss))
stdout.flush()
prev_loss = initial_loss
loss_watch_count = -1
min_change_count = -1
a0 = learning_rate
best_loss = initial_loss
bestU, bestV, bestW = self.U, self.V, self.W
best_epoch = 0
with open('/afs/inf.ed.ac.uk/user/s18/s1853165/nlu-coursework/code/lr={}hidden={}step={}.txt'.format(learning_rate, self.hidden_dims, back_steps), 'w') as fr:
for epoch in range(epochs):
if anneal > 0:
learning_rate = a0/((epoch+0.0+anneal)/anneal)
else:
learning_rate = a0
if log:
stdout.write("\nepoch %d, learning rate %.04f" % (epoch+1, learning_rate))
stdout.flush()
t0 = time.time()
count = 0
# use a random order of instances in the training set (helps avoid poor local optima when training on batches)
permutation = np.random.permutation(range(len(X)))
if log:
stdout.write("\tinstance 1")
for i in range(len(X)):
c = i+1
if log:
stdout.write("\b"*len(str(i)))
stdout.write("{0}".format(c))
stdout.flush()
p = permutation[i]
x_p = X[p]
d_p = D[p]
y_p, s_p = self.predict(x_p)
if back_steps == 0:
self.acc_deltas(x_p, d_p, y_p, s_p)
else:
self.acc_deltas_bptt(x_p, d_p, y_p, s_p, back_steps)
if i % batch_size == 0:
self.deltaU /= batch_size
self.deltaV /= batch_size
self.deltaW /= batch_size
self.apply_deltas(learning_rate)
if len(X) % batch_size > 0:
mod = len(X) % batch_size
self.deltaU /= mod
self.deltaV /= mod
self.deltaW /= mod
self.apply_deltas(learning_rate)
loss = sum([loss_function(X_dev[i], D_dev[i]) for i in range(len(X_dev))])/loss_sum
if log:
stdout.write("\tepoch done in %.02f seconds" % (time.time() - t0))
stdout.write("\tnew loss: {0}".format(loss))
stdout.flush()
if loss < best_loss:
best_loss = loss
bestU, bestV, bestW = self.U.copy(), self.V.copy(), self.W.copy()
best_epoch = epoch
fr.write('\nepoch %d, learning rate %.04f' % (epoch+1, learning_rate))
fr.write("\tnew loss: {0}".format(loss))
# make sure we change the RNN enough
if abs(prev_loss - loss) < min_change:
min_change_count += 1
else:
min_change_count = 0
if min_change_count > 2:
print("\n\ntraining finished after {0} epochs due to minimal change in loss".format(epoch+1))
break
prev_loss = loss
fr.write('\nbest epoch: {}\tbest loss: {}'.format(best_epoch, best_loss))
loss_sum = sum([len(d) for d in D_test])
loss = sum([loss_function(X_test[i], D_test[i]) for i in range(len(X_test))]) / loss_sum
fr.close()
t = time.time() - t_start
if min_change_count <= 2:
print("\n\ntraining finished after reaching maximum of {0} epochs".format(epochs))
print("best observed loss was {0}, at epoch {1}".format(best_loss, (best_epoch+1)))
print("setting U, V, W to matrices from best epoch")
self.U, self.V, self.W = bestU, bestV, bestW
label = ['U', 'V', 'W']
write_in = [self.U, self.V, self.W]
for i in range(3):
np.save('rnn.{}.npy' .format(label[i]), write_in[i])
return best_loss, loss
def train_np(self, X, D, X_dev, D_dev, X_test, D_test, epochs=20, learning_rate=0.5, anneal=20, back_steps=0, batch_size=100, min_change=0.0001, log=True):
'''
train the RNN on some training set X, D while optimizing the loss on a dev set X_dev, D_dev
training stops after the first of the following is true:
* number of epochs reached
* minimum change observed for more than 2 consecutive epochs
X a list of input vectors, e.g., [[5, 4, 2], [7, 3, 8]]
D a list of desired outputs, e.g., [[0], [1]]
X_dev a list of input vectors, e.g., [[5, 4, 2], [7, 3, 8]]
D_dev a list of desired outputs, e.g., [[0], [1]]
X_test a list of input vectors for the held-out test set, same format as X
D_test a list of desired outputs for the held-out test set, same format as D
epochs maximum number of epochs (iterations) over the training set. default 20
learning_rate initial learning rate for training. default 0.5
anneal positive integer. if > 0, lowers the learning rate harmonically after each epoch.
higher annealing rate means less change per epoch.
anneal=0 will not change the learning rate over time.
default 20
back_steps positive integer. number of timesteps for BPTT. if back_steps < 2, standard BP will be used. default 0
batch_size number of training instances to use before updating the RNN's weight matrices.
if set to 1, weights will be updated after each instance. if set to len(X), weights are only updated after each epoch.
default 100
min_change minimum change in loss between 2 epochs. if the change in loss is smaller than min_change, training stops regardless of
number of epochs left.
default 0.0001
log whether or not to print out log messages. (default log=True)
'''
if log:
stdout.write("\nTraining model for {0} epochs\ntraining set: {1} sentences (batch size {2})".format(epochs, len(X), batch_size))
stdout.write("\nOptimizing loss on {0} sentences".format(len(X_dev)))
stdout.write("\nVocab size: {0}\nHidden units: {1}".format(self.vocab_size, self.hidden_dims))
stdout.write("\nSteps for back propagation: {0}".format(back_steps))
stdout.write("\nInitial learning rate set to {0}, annealing set to {1}".format(learning_rate, anneal))
stdout.flush()
t_start = time.time()
loss_function = self.compute_loss_np
loss_sum = len(D_dev)
initial_loss = sum([loss_function(X_dev[i], D_dev[i]) for i in range(len(X_dev))]) / loss_sum
initial_acc = sum([self.compute_acc_np(X_dev[i], D_dev[i]) for i in range(len(X_dev))]) / len(X_dev)
stdout.write("\n\ncalculating initial mean loss on dev set")
stdout.write(": {0}\n".format(initial_loss))
stdout.write("calculating initial acc on dev set")
stdout.write(": {0}\n".format(initial_acc))
stdout.flush()
prev_loss = initial_loss
loss_watch_count = -1
min_change_count = -1
a0 = learning_rate
best_loss = initial_loss
bestU, bestV, bestW = self.U, self.V, self.W
best_epoch = 0
for epoch in range(epochs):
if anneal > 0:
learning_rate = a0/((epoch+0.0+anneal)/anneal)
else:
learning_rate = a0
if log:
stdout.write("\nepoch %d, learning rate %.04f" % (epoch+1, learning_rate))
stdout.flush()
t0 = time.time()
count = 0
# use a random order of instances in the training set (helps avoid poor local optima when training on batches)
permutation = np.random.permutation(range(len(X)))
if log:
stdout.write("\tinstance 1")
for i in range(len(X)):
c = i+1
if log:
stdout.write("\b"*len(str(i)))
stdout.write("{0}".format(c))
stdout.flush()
p = permutation[i]
x_p = X[p]
d_p = D[p]
y_p, s_p = self.predict(x_p)
if back_steps == 0:
self.acc_deltas_np(x_p, d_p, y_p, s_p)
else:
self.acc_deltas_bptt_np(x_p, d_p, y_p, s_p, back_steps)
if i % batch_size == 0:
self.deltaU /= batch_size
self.deltaV /= batch_size
self.deltaW /= batch_size
self.apply_deltas(learning_rate)
if len(X) % batch_size > 0:
mod = len(X) % batch_size
self.deltaU /= mod
self.deltaV /= mod
self.deltaW /= mod
self.apply_deltas(learning_rate)
loss = sum([loss_function(X_dev[i], D_dev[i]) for i in range(len(X_dev))]) / loss_sum
acc = sum([self.compute_acc_np(X_dev[i], D_dev[i]) for i in range(len(X_dev))]) / len(X_dev)
if log:
stdout.write("\tepoch done in %.02f seconds" % (time.time() - t0))
stdout.write("\tnew loss: {0}".format(loss))
stdout.write("\tnew acc: {0}".format(acc))
stdout.flush()
if loss < best_loss:
best_loss = loss
best_acc = acc
bestU, bestV, bestW = self.U.copy(), self.V.copy(), self.W.copy()
best_epoch = epoch
# make sure we change the RNN enough
if abs(prev_loss - loss) < min_change:
min_change_count += 1
else:
min_change_count = 0
if min_change_count > 2:
print("\n\ntraining finished after {0} epochs due to minimal change in loss".format(epoch+1))
break
prev_loss = loss
t = time.time() - t_start
# if min_change_count <= 2:
# print("\n\ntraining finished after reaching maximum of {0} epochs".format(epochs))
# print("best observed loss was {0}, acc {1}, at epoch {2}".format(best_loss, best_acc, (best_epoch+1)))
#
# print("setting U, V, W to matrices from best epoch")
self.U, self.V, self.W = bestU, bestV, bestW
loss_sum = len(D_test)
loss = sum([loss_function(X_test[i], D_test[i]) for i in range(len(X_test))]) / loss_sum
acc = sum([self.compute_acc_np(X_test[i], D_test[i]) for i in range(len(X_test))]) / len(X_test)
return loss, acc
if __name__ == "__main__":
mode = sys.argv[1].lower()
data_folder = sys.argv[2]
hdim = int(sys.argv[3])
lookback = int(sys.argv[4])
lr = float(sys.argv[5])
np.random.seed(2019)
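# Example invocations (the data folder path is illustrative; the last three
# arguments are hidden units, BPTT steps and learning rate, and are parsed
# for every mode, including predict-lm):
#   python rnn.py train-lm   data/ 50 5 0.5
#   python rnn.py train-np   data/ 50 5 0.5
#   python rnn.py predict-lm data/ 50 5 0.5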
if mode == "train-lm":
'''
code for training language model.
'''
train_size = 25000
dev_size = 1000
vocab_size = 2000
# get the data set vocabulary
vocab = pd.read_table(data_folder + "/vocab.wiki.txt", header=None, sep=r"\s+", index_col=0, names=['count', 'freq'], )
num_to_word = dict(enumerate(vocab.index[:vocab_size]))
word_to_num = invert_dict(num_to_word)
# calculate the fraction of tokens lost by limiting the vocabulary to vocab_size
fraction_lost = fraq_loss(vocab, word_to_num, vocab_size)
print("Retained %d words from %d (%.02f%% of all tokens)\n" % (vocab_size, len(vocab), 100*(1-fraction_lost)))
docs = load_lm_dataset(data_folder + '/wiki-train.txt')
S_train = docs_to_indices(docs, word_to_num, 1, 1)
X_train, D_train = seqs_to_lmXY(S_train)
# Load the dev set (for tuning hyperparameters)
docs = load_lm_dataset(data_folder + '/wiki-dev.txt')
S_dev = docs_to_indices(docs, word_to_num, 1, 1)
X_dev, D_dev = seqs_to_lmXY(S_dev)
# Load the test data
docs = load_lm_dataset(data_folder + '/wiki-test.txt')
S_test = docs_to_indices(docs, word_to_num, 1, 1)
X_test, D_test = seqs_to_lmXY(S_test)
X_train = X_train[:train_size]
D_train = D_train[:train_size]
X_dev = X_dev[:dev_size]
D_dev = D_dev[:dev_size]
# q = best unigram frequency from omitted vocab
# this is the best expected loss out of that set
q = vocab.freq[vocab_size] / sum(vocab.freq[vocab_size:])
# iterate over all combinations of hyperparameters and write their results to file.
# learning_rate = [1, 0.5, 0.1, 0.05]
# hidden_unit = [25, 50]
# steps = [0, 2, 5]
# for i in learning_rate:
# for j in hidden_unit:
# for k in steps:
# model = RNN(vocab_size, j, vocab_size)
# best_loss = model.train(X_train, D_train, X_dev, D_dev, epochs=20, anneal=20, learning_rate=i, back_steps=k)
model = RNN(vocab_size, hdim, vocab_size)
best_loss, loss = model.train(X_train, D_train, X_dev, D_dev, X_test, D_test, learning_rate=lr, back_steps=lookback)
run_loss = adjust_loss(loss, fraction_lost, q, mode='basic')
adjusted_loss = adjust_loss(loss, fraction_lost, q, mode='adjusted')
print(np.exp(run_loss), np.exp(adjusted_loss))
with open('/afs/inf.ed.ac.uk/user/s18/s1853165/nlu-coursework/code/lr={}hidden={}step={}.txt'.format(
lr, hdim, lookback), 'a') as fr:
fr.write('\nmean_loss of test set: {:.4f}' .format(loss))
fr.write("\nUnadjusted: %.03f" % np.exp(run_loss))
fr.write("\nAdjusted for missing vocab: %.03f" % np.exp(adjusted_loss))
fr.close()
if mode == "train-np":
'''
starter code for parameter estimation.
'''
train_size = 25000
dev_size = 1000
vocab_size = 2000
# get the data set vocabulary
vocab = pd.read_table(data_folder + "/vocab.wiki.txt", header=None, sep=r"\s+", index_col=0, names=['count', 'freq'], )
num_to_word = dict(enumerate(vocab.index[:vocab_size]))
word_to_num = invert_dict(num_to_word)
# calculate the fraction of tokens lost by limiting the vocabulary to vocab_size
fraction_lost = fraq_loss(vocab, word_to_num, vocab_size)
print("Retained %d words from %d (%.02f%% of all tokens)\n" % (vocab_size, len(vocab), 100*(1-fraction_lost)))
# load training data
sents = load_np_dataset(data_folder + '/wiki-train.txt')
S_train = docs_to_indices(sents, word_to_num, 0, 0)
X_train, D_train = seqs_to_npXY(S_train)
X_train = X_train[:train_size]
Y_train = D_train[:train_size]
# load development data
sents = load_np_dataset(data_folder + '/wiki-dev.txt')
S_dev = docs_to_indices(sents, word_to_num, 0, 0)
X_dev, D_dev = seqs_to_npXY(S_dev)
X_dev = X_dev[:dev_size]
D_dev = D_dev[:dev_size]
sents = load_np_dataset(data_folder + '/wiki-test.txt')
S_test = docs_to_indices(sents, word_to_num, 0, 0)
X_test, D_test = seqs_to_npXY(S_test)
model = RNN(vocab_size, hdim, vocab_size)
loss, acc = model.train_np(X_train, Y_train, X_dev, D_dev, X_test, D_test, epochs=20, back_steps=lookback, learning_rate=lr)
print("Accuracy: %.03f" % acc)
if mode == "predict-lm":
rnn_folder = '/afs/inf.ed.ac.uk/user/s18/s1853165/nlu-coursework'
# get saved RNN matrices and setup RNN
U,V,W = np.load(rnn_folder + "/rnn.U.npy"), np.load(rnn_folder + "/rnn.V.npy"), np.load(rnn_folder + "/rnn.W.npy")
vocab_size = len(V[0])
hdim = len(U[0])
dev_size = 1000
r = RNN(vocab_size, hdim, vocab_size)
r.U = U
r.V = V
r.W = W
# get vocabulary
vocab = pd.read_table(data_folder + "/vocab.wiki.txt", header=None, sep=r"\s+", index_col=0, names=['count', 'freq'], )
num_to_word = dict(enumerate(vocab.index[:vocab_size]))
word_to_num = invert_dict(num_to_word)
# Load the dev set (for tuning hyperparameters)
docs = load_lm_np_dataset(data_folder + '/wiki-dev.txt')
S_np_dev = docs_to_indices(docs, word_to_num, 1, 0)
X_np_dev, D_np_dev = seqs_to_lmnpXY(S_np_dev)
X_np_dev = X_np_dev[:dev_size]
D_np_dev = D_np_dev[:dev_size]
np_acc = r.compute_acc_lmnp(X_np_dev, D_np_dev)
print('Number prediction accuracy on dev set:', np_acc)
# load test data
sents = load_lm_np_dataset(data_folder + '/wiki-test.txt')
S_np_test = docs_to_indices(sents, word_to_num, 1, 0)
X_np_test, D_np_test = seqs_to_lmnpXY(S_np_test)
np_acc_test = r.compute_acc_lmnp(X_np_test, D_np_test)
print('Number prediction accuracy on test set:', np_acc_test)