<!doctype html>
<meta charset="utf-8">
<meta name="viewport" content="width=1080">
<script src="http://distill.pub/template.v1.js"></script>
<script type="text/front-matter">
title: Why Momentum Really Works
description: We often think of optimization with momentum as a ball rolling down a hill. This isn't wrong, but there is much more to the story.
authors:
- Gabriel Goh: http://gabgoh.github.io
affiliations:
- UC Davis: http://math.ucdavis.edu
</script>
<!-- Katex -->
<!--<script src="assets/lib/auto-render.min.js"></script>-->
<!--<script src="assets/lib/katex.min.js"></script>-->
<link rel="stylesheet" href="assets/lib/katex.min.css">
<link rel="stylesheet" type="text/css" href="assets/widgets.css">
<!-- Required -->
<script src="assets/lib/lib.js"></script>
<script src="assets/utils.js"></script>
<script>
var renderQueue = [];
function renderMath(elem) {
// renderMathInElement(
// elem,
// {
// delimiters: [
// {left: "$$", right: "$$", display: true},
// {left: "$", right: "$", display: false},
// ]
// }
// );
}
var deleteQueue = [];
function renderLoading(figure) {
var loadingScreen = figure.append("svg")
.style("width", figure.style("width"))
.style("height", figure.style("height"))
.style("position","absolute")
.style("top", "0px")
.style("left","0px")
.style("background","white")
.style("border", "0px dashed #DDD")
.style("opacity", 1)
return function(callback) { loadingScreen.remove() };
}
</script>
<div id="math-cache" style="display: none;">
<dt-math class="star">\star</dt-math>
<dt-math class="plus">+</dt-math>
<dt-math class="minus">-</dt-math>
<dt-math class="equals">=</dt-math>
<dt-math class="alpha">\alpha</dt-math>
<dt-math class="lambda">\lambda</dt-math>
<dt-math class="beta">\beta</dt-math>
<dt-math class="r">R</dt-math>
<dt-math class="alpha-equals">\alpha=</dt-math>
<dt-math class="beta-equals">\beta=</dt-math>
<dt-math class="beta-equals-zero">\beta = 0</dt-math>
<dt-math class="beta-equals-one">\beta=1</dt-math>
<dt-math class="alpha-equals-one-over-lambda-i">\alpha = 1/\lambda_i</dt-math>
<dt-math class="model">\text{model}</dt-math>
<dt-math class="p">0 p_1</dt-math>
<dt-math class="phat">0 \bar{p}_1</dt-math>
<dt-math class="two-sqrt-beta">2\sqrt{\beta}</dt-math>
<dt-math class="lambda-i">\lambda_i</dt-math>
<dt-math class="lambda-i-equals-zero">\lambda_i = 0</dt-math>
<dt-math class="alpha-gt-one-over-lambda-i">\alpha > 1/\lambda_i</dt-math>
<dt-math class="max-sigma-one">\max\{|\sigma_1|,|\sigma_2|\} > 1</dt-math>
<dt-math class="x-i-k">x_i^k - x_i^*</dt-math>
<dt-math class="xi-i">\xi_i</dt-math>
<dt-math class="beta-equals-one-minus">\beta = (1 - \sqrt{\alpha \lambda_i})^2</dt-math>
</div>
<script>
function MathCache(id) {
return document.querySelector("#math-cache ." + id).innerHTML;
}
</script>
<svg style="display: none;">
<g id="pointerThingy">
<circle fill="none" stroke="#FF6C00" stroke-linecap="round" cx="0" cy="0" r="14"/>
<circle fill="#FF6C00" cx="0" cy="0" r="11"/>
<path id="XMLID_173_" fill="#FFFFFF" d="M-3.2-1.3c0-0.1,0-0.2,0-0.3c0-0.1,0-0.2,0-0.3c-0.6,0-1.2,0-1.8,0c0,0.6,0,1.2,0,1.8
c0.2,0,0.4,0,0.6,0c0-0.4,0-0.8,0-1.2c0,0,0.1,0,0.1,0c0.3,0,0.5,0,0.8,0C-3.4-1.3-3.3-1.3-3.2-1.3c0,0.2,0,0.4,0,0.6
c0.2,0,0.4,0,0.6,0c0,0.2,0,0.4,0,0.6c0.2,0,0.4,0,0.6,0c0,0,0,0,0-0.1c0-1.6,0-3.2,0-4.8c0-0.6,0-1.2,0-1.8c0,0,0,0,0.1,0
c0.3,0,0.7,0,1,0c0.1,0,0.1,0,0.2,0c0-0.2,0-0.4,0-0.6c-0.4,0-0.8,0-1.2,0C-2-7.2-2-7-2-6.8c0,0,0,0-0.1,0c-0.2,0-0.3,0-0.5,0
c0,0,0,0-0.1,0c0,1.8,0,3.6,0,5.5c-0.2,0-0.3,0-0.4,0C-3.1-1.3-3.2-1.3-3.2-1.3z M1.1-3.7C1-3.8,1-3.8,1.1-3.7C1-4,1-4.1,1-4.3
c0,0,0,0,0-0.1c-0.4,0-0.8,0-1.2,0c0-0.8,0-1.6,0-2.4c-0.2,0-0.4,0-0.6,0c0,1.8,0,3.6,0,5.5c0.2,0,0.4,0,0.6,0c0-0.8,0-1.6,0-2.4
c0,0,0.1,0,0.1,0C0.3-3.7,0.6-3.7,1.1-3.7C1-3.7,1-3.7,1.1-3.7C1.1-3.7,1-3.7,1.1-3.7c0,0.8,0,1.6,0,2.3c0,0,0,0.1,0,0.1
c0.2,0,0.4,0,0.6,0c0-0.6,0-1.2,0-1.8c0.4,0,0.8,0,1.2,0c0,0.8,0,1.6,0,2.4c0.2,0,0.4,0,0.6,0c0-0.6,0-1.2,0-1.8c0.2,0,0.4,0,0.6,0
c0,0,0,0,0,0.1c0,0.1,0,0.3,0,0.4c0,0,0,0.1,0,0.1c0.2,0,0.4,0,0.5,0c0,0,0.1,0,0.1,0.1c0,0.2,0,0.5,0,0.7c0,1.1,0,2.3,0,3.4
c0,0,0,0,0,0.1c-0.2,0-0.4,0-0.6,0c0,0,0,0,0,0c0,0.6,0,1.1,0,1.7c0,0,0,0,0,0.1c-0.2,0-0.4,0-0.6,0c0,0.4,0,0.8,0,1.2
c-1.6,0-3.2,0-4.9,0c0-0.4,0-0.8,0-1.2c-0.2,0-0.4,0-0.6,0C-2,3.8-2,3.4-2,3c-0.2,0-0.4,0-0.6,0c0,0.4,0,0.8,0,1.2
c0.2,0,0.4,0,0.6,0C-2,4.8-2,5.4-2,6c2,0,4.1,0,6.1,0c0-0.1,0-0.2,0-0.3c0-0.5,0-0.9,0-1.4c0-0.1,0-0.1,0-0.2c0.2,0,0.4,0,0.5,0
c0.1,0,0.1,0,0.1-0.1c0-0.4,0-0.9,0-1.3c0-0.1,0-0.3,0-0.4c0.1,0,0.2,0,0.3,0c0.1,0,0.2,0,0.3,0c0-1.4,0-2.8,0-4.3
c-0.2,0-0.4,0-0.6,0c0-0.2,0-0.4,0-0.6c-0.2,0-0.4,0-0.6,0c0-0.2,0-0.4,0-0.6c-0.4,0-0.8,0-1.2,0c0-0.2,0-0.4,0-0.6
c-0.1,0-0.2,0-0.3,0c-0.4,0-0.9,0-1.3,0C1.2-3.7,1.1-3.7,1.1-3.7z M-3.2,1.8c0,0.4,0,0.8,0,1.2c0.2,0,0.4,0,0.5,0
c0.1,0,0.1,0,0.1-0.1c0-0.3,0-0.6,0-1c0-0.1,0-0.1,0-0.2C-2.8,1.8-3,1.8-3.2,1.8c0-0.4,0-0.8,0-1.2c-0.2,0-0.4,0-0.6,0
c0-0.2,0-0.4,0-0.6c-0.2,0-0.4,0-0.6,0c0,0.2,0,0.4,0,0.6c0.2,0,0.4,0,0.6,0c0,0,0,0,0,0.1c0,0.1,0,0.3,0,0.4c0,0.2,0,0.5,0,0.7
c0,0,0,0.1,0.1,0.1c0.1,0,0.2,0,0.3,0C-3.4,1.8-3.3,1.8-3.2,1.8z"/>
<path id="XMLID_172_" fill="#FFFFFF" d="M4.1,4.2C4.1,4.2,4.1,4.2,4.1,4.2c0-0.6,0-1.2,0-1.8c0,0,0,0,0,0c0.2,0,0.4,0,0.6,0
c0,0,0-0.1,0-0.1c0-1.1,0-2.3,0-3.4c0-0.2,0-0.5,0-0.7c0,0,0-0.1-0.1-0.1c-0.2,0-0.4,0-0.5,0c0,0,0-0.1,0-0.1c0-0.1,0-0.3,0-0.4
c0,0,0-0.1,0-0.1c-0.2,0-0.4,0-0.6,0c0,0.6,0,1.2,0,1.8c-0.2,0-0.4,0-0.6,0c0-0.8,0-1.6,0-2.4c-0.4,0-0.8,0-1.2,0
c0,0.6,0,1.2,0,1.8c-0.2,0-0.4,0-0.6,0c0,0,0-0.1,0-0.1c0-0.7,0-1.5,0-2.2c0,0,0-0.1,0-0.1l0,0c0.1,0,0.2,0,0.2,0
c0.4,0,0.9,0,1.3,0c0.1,0,0.2,0,0.3,0c0,0.2,0,0.4,0,0.6c0.4,0,0.8,0,1.2,0c0,0.2,0,0.4,0,0.6c0.2,0,0.4,0,0.6,0c0,0.2,0,0.4,0,0.6
c0.2,0,0.4,0,0.6,0c0,1.4,0,2.8,0,4.3c-0.1,0-0.2,0-0.3,0c-0.1,0-0.2,0-0.3,0c0,0.1,0,0.3,0,0.4c0,0.4,0,0.9,0,1.3
c0,0.1,0,0.1-0.1,0.1C4.5,4.2,4.3,4.2,4.1,4.2L4.1,4.2z"/>
<path id="XMLID_171_" fill="#FFFFFF" d="M4.1,4.2c0,0.1,0,0.1,0,0.2c0,0.5,0,0.9,0,1.4c0,0.1,0,0.2,0,0.3C2.1,6,0,6-2,6
c0-0.6,0-1.2,0-1.8c-0.2,0-0.4,0-0.6,0c0-0.4,0-0.8,0-1.2C-2.4,3-2.2,3-2,3c0,0.4,0,0.8,0,1.2c0.2,0,0.4,0,0.6,0c0,0.4,0,0.8,0,1.2
c1.6,0,3.2,0,4.9,0c0-0.4,0-0.8,0-1.2C3.7,4.2,3.9,4.2,4.1,4.2L4.1,4.2z"/>
<path id="XMLID_170_" fill="#FFFFFF" d="M-2-6.8c0,0.6,0,1.2,0,1.8c0,1.6,0,3.2,0,4.8c0,0,0,0,0,0.1c-0.2,0-0.4,0-0.6,0
c0-0.2,0-0.4,0-0.6c-0.2,0-0.4,0-0.6,0c0-0.2,0-0.4,0-0.6l0,0c0.1,0,0.1,0,0.2,0c0.1,0,0.3,0,0.4,0c0-1.8,0-3.6,0-5.5
c0,0,0.1,0,0.1,0C-2.4-6.8-2.2-6.8-2-6.8C-2.1-6.8-2-6.8-2-6.8L-2-6.8z"/>
<path id="XMLID_169_" fill="#FFFFFF" d="M1.1-3.7C1-3.7,1-3.7,1.1-3.7c-0.4,0-0.8,0-1.2,0c0,0,0,0-0.1,0c0,0.8,0,1.6,0,2.4
c-0.2,0-0.4,0-0.6,0c0-1.8,0-3.6,0-5.5c0.2,0,0.4,0,0.6,0c0,0.8,0,1.6,0,2.4c0.4,0,0.8,0,1.2,0c0,0,0,0.1,0,0.1C1-4.1,1-4,1.1-3.7
C1-3.8,1-3.8,1.1-3.7L1.1-3.7z"/>
<path id="XMLID_168_" fill="#FFFFFF" d="M-3.2,1.8c-0.1,0-0.2,0-0.3,0c-0.1,0-0.2,0-0.3,0c0,0-0.1,0-0.1-0.1c0-0.2,0-0.5,0-0.7
c0-0.1,0-0.3,0-0.4c0,0,0,0,0-0.1c-0.2,0-0.4,0-0.6,0c0-0.2,0-0.4,0-0.6c0.2,0,0.4,0,0.6,0c0,0.2,0,0.4,0,0.6c0.2,0,0.4,0,0.6,0
C-3.2,0.9-3.2,1.3-3.2,1.8c0.2,0,0.4,0,0.6,0c0,0.1,0,0.1,0,0.2c0,0.3,0,0.6,0,1C-2.6,3-2.7,3-2.7,3c-0.2,0-0.3,0-0.5,0
C-3.2,2.6-3.2,2.2-3.2,1.8z"/>
<path id="XMLID_167_" fill="#FFFFFF" d="M-3.2-1.3c-0.1,0-0.2,0-0.3,0c-0.3,0-0.5,0-0.8,0c0,0,0,0-0.1,0c0,0.4,0,0.8,0,1.2
c-0.2,0-0.4,0-0.6,0c0-0.6,0-1.2,0-1.8c0.6,0,1.2,0,1.8,0c0,0.1,0,0.2,0,0.3C-3.2-1.5-3.2-1.4-3.2-1.3L-3.2-1.3z"/>
<path id="XMLID_166_" fill="#FFFFFF" d="M-2-6.8C-2-7-2-7.2-2-7.4c0.4,0,0.8,0,1.2,0c0,0.2,0,0.4,0,0.6c-0.1,0-0.1,0-0.2,0
C-1.3-6.8-1.6-6.8-2-6.8C-2-6.8-2-6.8-2-6.8L-2-6.8z"/>
</g>
</svg>
<dt-article class="centered">
<h1>Why Momentum Really Works</h1>
<figure style = "position:relative; width:984px; height:400px;">
<div id="banana" style="position:relative; border: 1px solid rgba(0, 0, 0, 0.2);"></div>
<div id="sliderAlpha" style="position:absolute; width:300px; height: 50px; left:20px; top: 320px;">
<text class="figtext" style="top: -5px; left: 20px; position: relative;">Step-size α = 0.02</text>
</div>
<div id="sliderBeta" style="position:absolute; width: 300px; height: 50px; left: 280px; top: 320px;;">
<text class="figtext" style="top: -5px; left: 20px; position: relative;">Momentum β = 0.99</text>
</div>
<figcaption id="Bananacaption" style="position:absolute; width: 420px; height: 90px; left: 540px; top: 320px;">
We often think of Momentum as a means of dampening oscillations and speeding up the iterations, leading to faster convergence. But it has other interesting behavior. It allows a larger range of step-sizes to be used, and creates its own oscillations. What is going on?
</figcaption>
</figure>
<dt-byline class="l-page"></dt-byline>
<script src="assets/lib/contour_plot.js"></script>
<script src="assets/iterates.js"></script>
<script>
// Render Foreground
var iterControl = genIterDiagram(bananaf, [1,1/3], [[-2,2],[2/3 + 0.4,-2/3 + 0.4]])
.alpha(0.003)
.beta(0)
(d3.select("#banana").style("position","relative"))
var iterChange = iterControl.control
var getw0 = iterControl.w0
var StepRange = d3.scaleLinear().domain([0,100]).range([0,0.0062])
var MomentumRange = d3.scaleLinear().domain([0,100]).range([0,0.98])
var update = function (i,j) { iterChange(i, 0, getw0()) }
var slidera = sliderGen([230, 40])
.ticks([0,0.003,0.006])
.ticktitles( function(d,i) { return ["0", "0.003", "0.006"][i]})
.change( function (i) {
d3.select("#sliderAlpha").selectAll(".figtext").html("Step-size α = " + getalpha().toPrecision(2) )
iterChange(getalpha(), getbeta(), getw0() )
} )
.startxval(0.003)
.cRadius(7)
.shifty(-12)
.margins(20,20)
var sliderb = sliderGen([230, 40])
.ticks([0,0.5,0.99])
.change( function (i) {
d3.select("#sliderBeta").selectAll(".figtext").html("Momentum β = " + getbeta().toPrecision(2) )
iterChange(getalpha(), getbeta(), getw0() )
} )
.cRadius(7)
.shifty(-12)
.startxval(0.74)
.margins(20,20)
var getalpha = slidera( d3.select("#sliderAlpha")).xval
var getbeta = sliderb( d3.select("#sliderBeta")).xval
iterChange(getalpha(), getbeta(), getw0() )
</script>
<!-- The Momentum algorithm <dt-cite key="sutskever2013importance,polyak1964some,rutishauser1959theory"></dt-cite>, also known as the heavy ball method, can sometimes seem inseparable from the physics which has inspired it. It is common to think of gradient descent as a man walking down a hill, and momentum is a heavy ball rolling down it. Momentum grants iterates inertial energy, dampening oscillations between steep valleys, reinforcing directions which vary smoothly, allowing it to plough through narrow valleys, small humps, ridges and local minima.
The Momentum algorithm has a very natural physical interpretation. Where gradient descent walks down a hill along the steepest path, momentum is a heavy ball rolling down, ploughing through narrow valleys, small humps, ridges and local minima.
</p>
<p>
This story, now a staple of any exposition of the subject, is unfortunately a cartoon. And though it isn't wrong, it is an oversimplification, failing to account for many of momentum's important properties. It does not explain, for example, how much momentum should be used. Momentum works by overshooting the target, and relies on the gradient's corrective forces to set it back on track. But are these wasted iterations really worth it? And if so, how much of a speedup can one expect? Momentum also has the effect of increasing the range of permissible step-sizes. But what are the limits of this expansion, and why does it happen?
</p> -->
<p>
Here’s a popular story about momentum <dt-cite key="sutskever2013importance,polyak1964some,rutishauser1959theory"></dt-cite>: gradient descent is a man walking down a hill. He follows the steepest path downwards; his progress is slow, but steady. Momentum is a heavy ball rolling down the same hill. The added inertia acts both as a smoother and an accelerator, dampening oscillations and causing us to barrel through narrow valleys, small humps and local minima.
</p>
<p>
This standard story isn’t wrong, but it fails to explain many important behaviors of momentum. In fact, momentum can be understood far more precisely if we study it on the right model.
</p>
<p>
One nice model is the convex quadratic. This model is rich enough to reproduce momentum’s local dynamics in real problems, and yet simple enough to be understood in closed form. This balance gives us powerful traction for understanding this algorithm.
</p>
<hr>
<p>
We begin with gradient descent. The algorithm has many virtues, but speed is not one of them. It is simple -- when optimizing a smooth function <dt-math>f</dt-math>, we take a small step in the direction of the negative gradient
<dt-math block>w^{k+1} = w^k-\alpha\nabla f(w^k).</dt-math>
For a step-size small enough, gradient descent makes a monotonic improvement at every iteration. It always converges, albeit to a local minimum. And under a few weak curvature conditions it can even get there at an exponential rate.
</p>
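<p>
In code, the update is a one-line loop. Here is a minimal NumPy sketch; grad_f stands in for any routine returning <dt-math>\nabla f(w)</dt-math>, and the little quadratic bowl at the end is purely illustrative.
</p>
<dt-code block language="python">
import numpy as np

def gradient_descent(grad_f, w0, alpha, iters):
    """Gradient descent: w^{k+1} = w^k - alpha * grad f(w^k)."""
    w = np.array(w0, dtype=float)
    for _ in range(iters):
        w = w - alpha * grad_f(w)
    return w

# Illustration on f(w) = (w_1 - 1)^2 + 10 (w_2 + 2)^2, whose minimum is (1, -2).
grad_f = lambda w: np.array([2 * (w[0] - 1), 20 * (w[1] + 2)])
w = gradient_descent(grad_f, w0=[0.0, 0.0], alpha=0.05, iters=200)
print(w)  # close to [1, -2]
</dt-code>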
<p>
But this exponential rate of decrease, though appealing in theory, can often be infuriatingly slow. Things often begin quite well -- with an impressive, almost immediate decrease in the loss. But as the iterations progress, things start to slow down. You start to get a nagging feeling you're not making as much progress as you should be. What has gone wrong?
</p>
<p>
The problem could be the optimizer's old nemesis, pathological curvature. Pathological curvature is, simply put, regions of <dt-math>f</dt-math> which aren't scaled properly. The landscapes are often described as valleys, trenches, canals and ravines. The iterates either jump between valleys, or approach the optimum in small, timid steps. Progress along certain directions grinds to a halt. In these unfortunate regions, gradient descent fumbles.</p>
<p>
Momentum proposes the following tweak to gradient descent. We give gradient descent a short-term memory:
<dt-math block>
\begin{aligned}
z^{k+1}&=\beta z^{k}+\nabla f(w^{k})\\[0.4em]
w^{k+1}&=w^{k}-\alpha z^{k+1}
\end{aligned}
</dt-math>
The change is innocent, and costs almost nothing. When <dt-math>\beta = 0</dt-math>, we recover gradient descent. But for <dt-math>\beta = 0.99</dt-math> (sometimes <dt-math>0.999</dt-math>, if things are really bad), this appears to be the boost we need. Our iterations regain the speed and boldness they lost, speeding to the optimum with a renewed energy.
</p>
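<p>
The two-line change looks like this in the same NumPy sketch (grad_f plays the same role as before).
</p>
<dt-code block language="python">
import numpy as np

def momentum(grad_f, w0, alpha, beta, iters):
    """Heavy-ball momentum:
       z^{k+1} = beta * z^k + grad f(w^k)
       w^{k+1} = w^k - alpha * z^{k+1}
    """
    w = np.array(w0, dtype=float)
    z = np.zeros_like(w)
    for _ in range(iters):
        z = beta * z + grad_f(w)
        w = w - alpha * z
    return w

# beta = 0 recovers plain gradient descent; beta = 0.99 is the aggressive
# setting mentioned in the text.
</dt-code>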
<p>
Optimizers call this minor miracle "acceleration".
</p>
<p>
The new algorithm may seem at first glance like a cheap hack. A simple trick to get around gradient descent's more aberrant behavior -- a smoother for oscillations between steep canyons. But the truth, if anything, is the other way round. It is gradient descent which is the hack. First, momentum gives up to a quadratic speedup on many functions. <dt-fn> It is possible, however, to construct very specific counterexamples where momentum does not converge, even on convex functions. See <dt-cite key="lessard2016analysis"></dt-cite> for a counterexample. </dt-fn> This is no small matter -- this is similar to the speedup you get from the Fast Fourier Transform, Quicksort, and Grover's Algorithm. When the universe gives you quadratic speedups, you should start to pay attention.
</p>
<p>
But there's more. A lower bound, courtesy of Nesterov <dt-cite key="nesterov2013introductory"></dt-cite>, states that momentum is, in a certain very narrow and technical sense, optimal. Now, this doesn't mean it is the best algorithm for all functions in all circumstances. But it does satisfy some curiously beautiful mathematical properties which scratch a very human itch for perfection and closure. But more on that later. Let's say this for now -- momentum is an algorithm for the book.
</p>
<hr>
<h2>First Steps: Gradient Descent</h2>
<p>
We begin by studying gradient descent on the simplest model possible which isn't trivial -- the convex quadratic,
<dt-math block>
f(w) = \tfrac{1}{2}w^TAw - b^Tw, \qquad w \in \mathbf{R}^n.
</dt-math>
Assume <dt-math>A</dt-math> is symmetric and invertible; then the optimal solution <dt-math>w^{\star}</dt-math> occurs at
<dt-math block> w^{\star} = A^{-1}b.</dt-math>
Simple as this model may be, it is rich enough to approximate many functions (think of <dt-math>A</dt-math> as your favorite model of curvature -- the Hessian, Fisher Information Matrix <dt-cite key="amari1998natural"></dt-cite>, etc) and captures all the key features of pathological curvature. And more importantly, we can write an exact closed formula for gradient descent on this function.
</p>
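<p>
As a quick illustrative sketch in NumPy, with a small positive definite <dt-math>A</dt-math> chosen arbitrarily, the minimizer is a single linear solve:
</p>
<dt-code block language="python">
import numpy as np

# f(w) = 0.5 w^T A w - b^T w with a symmetric positive definite A (illustrative).
A = np.array([[2.0, 1.0],
              [1.0, 3.0]])
b = np.array([1.0, 2.0])

w_star = np.linalg.solve(A, b)   # the optimum w* = A^{-1} b
f = lambda w: 0.5 * w @ A @ w - b @ w
print(w_star, f(w_star))
</dt-code>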
<p>
This is how it goes. Since <dt-math>\nabla f(w)=Aw - b</dt-math>, the iterates are
<dt-math block>
w^{k+1}=w^{k}- \alpha (Aw^{k} - b).
</dt-math>
Here's the trick. There is a very natural space to view gradient descent where all the dimensions act independently -- the eigenvectors of <dt-math>A</dt-math>.
</p>
<figure style = "width:750px; height:340px; display:block; margin-left:auto; margin-right:auto; position:relative" id = "change_of_variables">
<div id = "mom1" style="width:400px; position:absolute; left:0px; top:0px"></div>
<div id = "mom2" style="width:400px; position:absolute; left:400px; top:0px"></div>
</figure>
<script>
deleteQueue.push(renderLoading(d3.select("#change_of_variables")))
renderQueue.push(function(callback) {
var U = givens(Math.PI/4)
var Ut = numeric.transpose(U)
// Render Foreground
var left = d3.select("#mom1").style("border", "1px solid rgba(0, 0, 0, 0.2)")
var c1 = genIterDiagram(quadf, [0,0], [[-3,3],[-3,3]])
.width(340)
.height(340)
.iters(300)
.alpha(0.018)
.showSolution(false)
.pathWidth(1)
.circleRadius(1.5)
.pointerScale(0.8)
.showStartingPoint(false)
.drag(function() {
c2.control(c1.alpha(),
c1.beta(),
numeric.dot(U,c1.w0())) })
(left)
var right = d3.select("#mom2").style("border", "1px solid rgba(0, 0, 0, 0.2)")
var c2 = genIterDiagram(eyef, [0,0], [[-3,3],[-3,3]])
.width(340)
.height(340)
.iters(300)
.alpha(0.018)
.showSolution(false)
.pathWidth(1)
.circleRadius(1.5)
.pointerScale(0.8)
.showStartingPoint(false)
.drag(function() {
c1.control(c2.alpha(),
c2.beta(),
numeric.dot(Ut,c2.w0())) })
(right)
// Initialize
c2.control(0.018,0,[-2.5,1])
c1.control(0.018,0,numeric.dot(Ut,[-2.5,1]));
callback(null);
});
</script>
<p>
Every symmetric matrix <dt-math>A</dt-math> has an eigenvalue decomposition
<dt-math block>
A=Q\ \text{diag}(\lambda_{1},\ldots,\lambda_{n})\ Q^{T},\qquad Q = [q_1,\ldots,q_n],
</dt-math>
and, as per convention, we will assume that the <dt-math>\lambda_i</dt-math>'s are sorted, from smallest <dt-math>\lambda_1</dt-math> to biggest <dt-math>\lambda_n</dt-math>. If we perform a change of basis, <dt-math>x^{k} = Q^T(w^{k} - w^\star)</dt-math>, the iterations break apart, becoming:
<dt-math block>
\begin{aligned}
x_{i}^{k+1} & =x_{i}^{k}-\alpha \lambda_ix_{i}^{k} \\[0.4em]
&= (1-\alpha\lambda_i)x^k_i=(1-\alpha \lambda_i)^{k+1}x^0_i
\end{aligned}
</dt-math>
Moving back to our original space <dt-math>w</dt-math>, we can see that
<dt-math block>
w^k - w^\star = Qx^k=\sum_{i=1}^n x^0_i(1-\alpha\lambda_i)^k q_i
</dt-math>
and there we have it -- gradient descent in closed form.
</p>
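<p>
The closed form is easy to check numerically. A small NumPy sketch, on an arbitrary two dimensional quadratic:
</p>
<dt-code block language="python">
import numpy as np

A = np.array([[1.0, 0.4],
              [0.4, 0.25]])            # symmetric positive definite (illustrative)
b = np.array([1.0, 0.0])
w_star = np.linalg.solve(A, b)

lam, Q = np.linalg.eigh(A)             # A = Q diag(lam) Q^T, lam in ascending order
alpha, iters = 1.0, 50                 # alpha * lam_i stays below 2 here
w0 = np.array([2.0, -1.0])
x0 = Q.T @ (w0 - w_star)               # error components in the eigenbasis

w = w0.copy()
for _ in range(iters):
    w = w - alpha * (A @ w - b)        # gradient descent

# Closed form: w^k - w* = sum_i x_i^0 (1 - alpha lam_i)^k q_i
w_closed = w_star + Q @ (x0 * (1 - alpha * lam) ** iters)
print(np.allclose(w, w_closed))        # True
</dt-code>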
<h3>Decomposing the Error</h3>
<p>
The above equation admits a simple interpretation. Each element of <dt-math>x^0</dt-math> is the component of the error in the initial guess in the <dt-math>Q</dt-math>-basis. There are <dt-math>n</dt-math> such errors, and each of these errors follows its own, solitary path to the minimum, decreasing exponentially with a compounding rate of <dt-math>1-\alpha\lambda_i</dt-math>. The closer that number is to <dt-math>1</dt-math>, the slower it converges.
</p>
<p>
For most step-sizes, the eigenvectors with largest eigenvalues converge the fastest. This triggers an explosion of progress in the first few iterations, before things slow down as the smaller eigenvectors' struggles are revealed. By writing the loss in terms of the error in each eigenspace,
<dt-math block>
f(w^{k})-f(w^{\star})=\tfrac{1}{2}\sum_i(1-\alpha\lambda_{i})^{2k}\lambda_{i}[x_{i}^{0}]^2
</dt-math>
we can visualize the contribution of each error component to the loss.
</p>
<figure style="position:relative; width:920px; height:360px" id = "milestones_gd">
<figcaption style="position:absolute; text-align:left; left:135px; width:350px; height:80px">Optimization can be seen as combination of several component problems, shown here as <svg style="position:relative; top:2px; width:3px; height:14px; background:#fde0dd"></svg> 1 <svg style="position:relative; top:2px; width:3px; height:14px; background:#fa9fb5"></svg> 2 <svg style="position:relative; top:2px; width:3px; height:14px; background:#c51b8a"></svg> 3 with eigenvalues <svg style="position:relative; top:2px; width:3px; height:14px; background:#fde0dd"></svg> <dt-math>\lambda_1=0.01</dt-math>, <svg style="position:relative; top:2px; width:3px; height:14px; background:#fa9fb5"></svg> <dt-math>\lambda_2=0.1</dt-math>, and <svg style="position:relative; top:2px; width:3px; height:14px; background:#c51b8a"></svg> <dt-math>\lambda_3=1</dt-math> respectively. </figcaption>
<!-- ["#fde0dd", "#fa9fb5", "#c51b8a"]
-->
<div id = "sliderStep" style="position:absolute; left:550px; width:250px; height:100px">
<div id="stepSizeMilestones"
class="figtext"
style="position:absolute; left:15px; top:15px">
Step-size
</div>
<div class="figtext2" id="milestones_gd_optstep"
style="position:absolute; font-size:11px; left:152px; top:18px; z-index:10; cursor: pointer">
Optimal Step-size
</div>
<svg style="position:absolute; font-size:10px; left:224px; top:34px">
<line marker-end="url(#arrowhead)" style="stroke: black; stroke-width: 1.5; visibility: visible;" x2="5" y2="10" x1="5" y1="0"></line>
</svg>
</div>
<div id = "obj"></div>
</figure>
<script src="assets/milestones.js"></script>
<script>
deleteQueue.push(renderLoading(d3.select("#milestones_gd")))
renderQueue.push(function(callback) {
var graphDiv = d3.select("#obj")
.style("width", 920 + "px")
.style("height", 300 + "px")
.style("top", "90px")
.style("position", "relative")
.style("margin-left", "auto")
.style("margin-right", "auto")
.attr("width", 920)
.attr("height", 500)
var svg = graphDiv.append("svg")
.attr("width", 920)
.attr("height", 300)
.style("position","absolute")
.style("left", "15px")
var updateSliderGD = renderMilestones(svg, function() {});
var alphaHTML = MathCache("alpha-equals");
var slidera = sliderGen([250, 80])
.ticks([0,1,200/(101),2])
.change( function (i) {
var html = alphaHTML + '<span style="font-weight: normal;">' + i.toPrecision(4) + "</span>";
d3.select("#stepSizeMilestones")
.html("Stepsize " + html )
updateSliderGD(i,0.000)
} )
.ticktitles(function(d,i) { return [0,1,"",2][i] })
.startxval(200/(101))
.cRadius(7)
.shifty(-12)
.shifty(10)
.margins(20,20)(d3.select("#sliderStep"))
// renderDraggable(svg, [133.5, 23], [114.5, 90], 2, " ").attr("opacity", 0.1)
// renderDraggable(svg, [133.5, 88], [115.5, 95], 2, " ").attr("opacity", 0.1)
// renderDraggable(svg, [132.5, 154], [114.5, 100], 2, " ").attr("opacity", 0.1)
d3.select("#milestones_gd_optstep").on("click", slidera.init)
svg.append("text")
.attr("class", "katex morsd mathit")
.style("font-size", "19px")
.style("font-family","KaTeX_Math")
.attr("x", 105)
.attr("y", 50)
.attr("text-anchor", "end")
.attr("fill", "gray")
.html("f(w<tspan baseline-shift = \"super\" font-size = \"15\">k</tspan>) - f(w<tspan baseline-shift = \"super\" font-size = \"15\">*</tspan>)")
svg.append("text")
.style("font-size", "13px")
.attr("x", 0)
.attr("y", 80)
.attr("dy", 0)
.attr("transform", "translate(110,0)")
.attr("class", "caption")
.attr("text-anchor", "end")
.attr("fill", "gray")
.text("At the initial point, the error in each component is equal.")
svg.selectAll(".caption").call(wrap, 100)
svg.append("text")
.style("font-size", "13px")
.attr("x", 420)
.attr("y", 270)
.attr("dy", 0)
.attr("dx", -295)
.attr("text-anchor", "start")
.attr("fill", "gray")
.text("At the optimum, the rates of convergence of the largest and smallest eigenvalues equalize.")
callback(null);
});
</script>
<h3>Choosing A Step-size</h3>
<p>
The above analysis gives us immediate guidance as to how to set a step-size <dt-math>\alpha</dt-math>. In order to converge, each <dt-math>|1-\alpha \lambda_i|</dt-math> must be strictly less than 1. All workable step-sizes, therefore, fall in the interval
<dt-math block>0<\alpha\lambda_i<2.</dt-math>
The overall convergence rate is determined by the slowest error component, which must be either <dt-math>\lambda_1</dt-math> or <dt-math>\lambda_n</dt-math>:
<dt-math block>
\begin{aligned}\text{rate}(\alpha) & ~=~ \max_{i}\left|1-\alpha\lambda_{i}\right|\\[0.9em] & ~=~ \max\left\{|1-\alpha\lambda_{1}|,~ |1-\alpha\lambda_{n}|\right\} \end{aligned}
</dt-math>
</p>
<p>
This overall rate is minimized when the rates for <dt-math>\lambda_1</dt-math> and <dt-math>\lambda_n</dt-math> are the same -- this mirrors our informal observation in the previous section that the optimal step-size causes the first and last eigenvectors to converge at the same rate. If we work this through we get:
<dt-math block>
\begin{aligned}
\text{optimal }\alpha ~=~{\mathop{\text{argmin}}\limits_\alpha} ~\text{rate}(\alpha) & ~=~\frac{2}{\lambda_{1}+\lambda_{n}}\\[1.4em]
\text{optimal rate} ~=~{\min_\alpha} ~\text{rate}(\alpha) & ~=~\frac{\lambda_{n}/\lambda_{1}-1}{\lambda_{n}/\lambda_{1}+1}
\end{aligned}
</dt-math>
</p>
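<p>
A small NumPy sketch, on an illustrative spectrum, confirms the formula:
</p>
<dt-code block language="python">
import numpy as np

lam = np.array([0.01, 0.1, 1.0])                # eigenvalues, lambda_1 smallest
rate = lambda alpha: np.max(np.abs(1 - alpha * lam))

alpha_opt = 2 / (lam[0] + lam[-1])              # 2 / (lambda_1 + lambda_n)
kappa = lam[-1] / lam[0]
rate_opt = (kappa - 1) / (kappa + 1)            # (lambda_n/lambda_1 - 1) / (lambda_n/lambda_1 + 1)

print(rate(alpha_opt), rate_opt)                # both are about 0.9802 here

# No step-size on a fine grid of workable values does better than the formula:
grid = np.linspace(0.0, 2.0 / lam[-1], 5001)
print(min(rate(a) for a in grid) >= rate(alpha_opt))   # True
</dt-code>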
<p>
Notice the ratio <dt-math>\lambda_n/\lambda_1</dt-math> determines the convergence rate of the problem. In fact, this ratio appears often enough that we give it a name, and a symbol -- the condition number.
<dt-math block>
\text{condition number} := \kappa :=\frac{\lambda_n}{\lambda_1}
</dt-math>
The condition number means many things. It is a measure of how close to singular a matrix is. It is a measure of how robust <dt-math>A^{-1}b</dt-math> is to perturbations in <dt-math>b</dt-math>. And, in this context, the condition number gives us a measure of how poorly gradient descent will perform. A ratio of <dt-math>\kappa = 1</dt-math> is ideal, giving convergence in one step (of course, the function is trivial). Unfortunately the larger the ratio, the slower gradient descent will be. The condition number is therefore a direct measure of pathological curvature.
</p>
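<p>
For a symmetric positive definite matrix, NumPy's built-in condition number is this same ratio (reusing the small example matrix from the earlier sketch):
</p>
<dt-code block language="python">
import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 3.0]])
lam = np.linalg.eigvalsh(A)                  # eigenvalues in ascending order
print(lam[-1] / lam[0], np.linalg.cond(A))   # the same number, kappa
</dt-code>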
<hr>
<h2>Example: Polynomial Regression</h2>
<p>
The above analysis reveals an insight: all errors are not made equal. Indeed, there are different kinds of errors, <dt-math>n</dt-math> to be exact, one for each of the eigenvectors of <dt-math>A</dt-math>. And gradient descent is better at correcting some kinds of errors than others. But what do the eigenvectors of <dt-math>A</dt-math> mean? Surprisingly, in many applications they admit a very concrete interpretation.
</p>
<p>
Let's see how this plays out in polynomial regression. Given 1D data, <dt-math>\xi_i</dt-math>, our problem is to fit the model
<dt-math block>
\text{model}(\xi)=w_{1}p_{1}(\xi)+\cdots+w_{n}p_{n}(\xi)\qquad p_{i}=\xi\mapsto\xi^{i-1}
</dt-math>
to our observations, <dt-math>d_i</dt-math>. This model, though nonlinear in the input <dt-math>\xi</dt-math>, is linear in the weights, and therefore we can write the model as a linear combination of monomials, like:
</p>
<figure id = "poly0f" style="width:940px; height:200px">
<div id = "poly0" style="width:940px; height:185px; position:absolute; top:20px"></div>
</figure>
<script src="assets/eigensum.js"></script>
<script>
deleteQueue.push(renderLoading(d3.select("#poly0")))
renderQueue.push(function(callback) {
// Preprocess x, get eigendecomposition, etc
var x = [-0.6, -0.55,-0.5,-0.45,-0.4,0.4,0.45,0.5,0.55,0.6]
var D = vandermonde(x, 5)
var Eigs = eigSym(numeric.dot(numeric.transpose(D),D))
var U = Eigs.U
var lambda = Eigs.lambda
// Preprocess y
var b = [-3/2,-4/2,-5/2,-3/2,-2/2,1/2,2/2,3/2,2/2,1/2]
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var step = 1.8/lambda[0]
var iter = geniter(U, lambda, Dtb, step)
var eigensum = d3.select("#poly0")
var wi = [-2,-2,2,2,2,-2]
function refit(b) {
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var Utsol = numeric.dot(sol,U)
eigenControl.updateweights(Utsol)
}
var eigenControl = renderEigenPanel(eigensum, numeric.identity(6), x, b, wi, refit);
// Swoopy Annotator
var annotations = [
{
"x": 0,
"y": 0,
"path": "M 60,5 A 19.018 19.018 0 0 0 36,27",
"text": "scrub values",
"textOffset": [
64,
9
]
}
]
drawAnnotations(d3.select("#poly0f"), annotations)
callback(null);
});
</script>
<p>
Because of the linearity, we can fit this model to our data <dt-math>\xi_i</dt-math> using linear regression on the model mismatch
<dt-math block>
\text{minimize}_w \qquad\tfrac{1}{2}\sum_i (\text{model}(\xi_{i})-d_{i})^{2} ~~=~~ \tfrac{1}{2}\|Zw - d\|^2
</dt-math>
where
<dt-math block>
Z=\left(\begin{array}{ccccc}
1 & \xi_{1} & \xi_{1}^{2} & \ldots & \xi_{1}^{n-1}\\
1 & \xi_{2} & \xi_{2}^{2} & \ldots & \xi_{2}^{n-1}\\
\vdots & \vdots & \vdots & \ddots & \vdots\\
1 & \xi_{m} & \xi_{m}^{2} & \ldots & \xi_{m}^{n-1}
\end{array}\right).
</dt-math>
</p>
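<p>
A small NumPy sketch of this setup, starting from the same ten points the interactive figures begin with:
</p>
<dt-code block language="python">
import numpy as np

# The ten 1-D inputs and observations the figures start from.
xi = np.array([-0.6, -0.55, -0.5, -0.45, -0.4, 0.4, 0.45, 0.5, 0.55, 0.6])
d  = np.array([-1.5, -2.0, -2.5, -1.5, -1.0, 0.5, 1.0, 1.5, 1.0, 0.5])
n  = 6                                           # a degree-5 polynomial model

# Feature matrix Z whose columns are the monomials 1, xi, xi^2, ..., xi^(n-1).
Z = np.vander(xi, n, increasing=True)

# Least squares: minimize 0.5 * || Z w - d ||^2
w, *_ = np.linalg.lstsq(Z, d, rcond=None)

# The eigenvectors Q of Z^T Z give the basis in which the iterations decouple.
lam, Q = np.linalg.eigh(Z.T @ Z)                 # eigenvalues in ascending order
</dt-code>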
<p>
The path of convergence, as we know, is elucidated when we view the iterates in the space of <dt-math>Q</dt-math> (the eigenvectors of <dt-math>Z^T Z</dt-math>). So let's recast our regression problem in the basis of <dt-math>Q</dt-math>. First, we do a change of basis, by rotating <dt-math>w</dt-math> into <dt-math>Qw</dt-math>, and counter-rotating our feature maps <dt-math>p</dt-math> into eigenspace, <dt-math>\bar{p}</dt-math>. We can now conceptualize the same regression as one over a different polynomial basis, with the model
<dt-math block>
\text{model}(\xi)~=~x_{1}\bar{p}_{1}(\xi)~+~\cdots~+~x_{n}\bar{p}_{n}(\xi)\qquad \bar{p}_{i}=\sum q_{ij}p_j.
</dt-math>
This model is identical to the old one. But these new features <dt-math>\bar{p}</dt-math> (which I call "eigenfeatures") and weights have the pleasing property that each coordinate acts independently of the others. Now our optimization problem breaks down, really, into <dt-math>n</dt-math> small 1D optimization problems. And each coordinate can be optimized greedily and independently, one at a time in any order, to produce the final, global, optimum. The eigenfeatures are also much more informative:
</p>
<figure id = "poly1" style="width:940px; height:285px"></figure>
<script>
deleteQueue.push(renderLoading(d3.select("#poly1")))
renderQueue.push(function(callback) {
var inv = function(lambda) { return 1/lambda }
var scal = function(lambda) { return lambda < 1e-10 ? -100 : 1.5/Math.sqrt(lambda) }
// Preprocess x, get eigendecomposition, etc
var x = [-0.6, -0.55,-0.5,-0.45,-0.4,0.4,0.45,0.5,0.55,0.6]
var D = vandermonde(x, 5)
var Eigs = eigSym(numeric.dot(numeric.transpose(D),D))
var U = Eigs.U
var lambda = Eigs.lambda
// Preprocess y
var b = [-3/2,-4/2,-5/2,-3/2,-2/2,1/2,2/2,3/2,2/2,1/2]
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var step = 1.8/lambda[0]
var iter = geniter(U, lambda, Dtb, step)
var eigensum = d3.select("#poly1")
var wi = lambda.slice(0).map(scal)
function refit(b) {
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var Utsol = numeric.dot(sol,U)
eigenControl.updateweights(sol)
}
var eigenControl = renderEigenPanel(eigensum, U, x, b, wi, refit, true)
var annotate = eigensum
annotate.append("figcaption")
.style("width", 230 + "px")
.style("height", 150 + "px")
.style("left", "0px")
.style("position", "absolute")
.style("padding", "10px")
.html("The data comes in 2 clusters. The first 2 eigenfeatures capture variations between the clusters. ")
annotate.append("figcaption")
.style("width", 230 + "px")
.style("height", 150 + "px")
.style("left", "260px")
.style("position", "absolute")
.style("padding", "10px")
.html("Next there are smooth variations within clusters, peaks within clusters,")
annotate.append("figcaption")
.style("width", 230 + "px")
.style("height", 150 + "px")
.style("left", 530 + "px")
.style("position", "absolute")
.style("padding", "10px")
.html("and finally, jagged polynomials which differ wildly on neighboring points. ");
// Swoopy Annotator
var annotations = [
{
"x": 0,
"y": 0,
"path": "M 807,198 A 26.661 26.661 0 0 1 838,159",
"text": "drag points to fit data",
"textOffset": [
799,
214
]
}]
drawAnnotations(eigensum, annotations)
callback(null);
});
</script>
<p>
The observations in the above diagram can be justified mathematically. From a statistical point of view, we would like a model which is, in some sense, robust to noise. Our model cannot possibly be meaningful if the slightest perturbation to the observations changes the entire model dramatically. And the eigenfeatures, the principal components of the data, give us exactly the decomposition we need to sort the features by their sensitivity to perturbations in the <dt-math>d_i</dt-math>'s. The most robust components appear in the front (with the largest eigenvalues), and the most sensitive components in the back (with the smallest eigenvalues).
</p>
<p>
This measure of robustness, by a rather convenient coincidence, is also a measure of how easily an eigenspace converges. And thus, the "pathological directions" -- the eigenspaces which converge the slowest -- are also those which are most sensitive to noise! So starting at a simple initial point like <dt-math>0</dt-math> (by a gross abuse of language, let's think of this as a prior), we track the iterates till a desired level of complexity is reached. Let's see how this plays out in gradient descent.
</p>
<figure id = "poly2" style="width:940px; height:360px"></figure>
<script>
deleteQueue.push(renderLoading(d3.select("#poly2")))
renderQueue.push(function(callback) {
var inv = function(lambda) { return 1/lambda }
var scal = function(lambda) { return lambda < 1e-10 ? -100 : 1.5/Math.sqrt(lambda) }
// Preprocess x, get eigendecomposition, etc
var x = [-0.6, -0.55,-0.5,-0.45,-0.4,0.4,0.45,0.5,0.55,0.6]
var b = [-3/2,-4/2,-5/2,-3/2,-2/2,1/2,2/2,3/2,2/2,1/2]
var D = vandermonde(x, 5)
var Eigs = eigSym(numeric.dot(numeric.transpose(D),D))
var U = Eigs.U
var lambda = Eigs.lambda
// Preprocess y
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var step = 1.8/lambda[0]
var iter = geniter(U, lambda, Dtb, step)
var eigensum = d3.select("#poly2")
var wi = lambda.slice(0).map(scal)
function refit(b) {
var Dtb = numeric.dot(b,D)
iter = geniter(U, lambda, Dtb, step)
onChange(sliderControl.slidera.xval())
}
var eigenControl = renderEigenPanel(eigensum, U, x, b, wi, refit, true)
var barlengths = getStepsConvergence(lambda, step).map(Math.log)
var onChange = function(i) {
eigenControl.updateweights(numeric.dot(U,iter(Math.floor(Math.exp(i-0.1)) )))
}
var sliderControl = sliderBarGen(barlengths, function(i) {return Math.exp(i-0.1)}).update(onChange)(d3.select("#poly2"))
d3.select("#poly2").append("figcaption")
.style("width", "120px")
.style("position", "absolute")
.style("left", "820px")
.style("top","200px")
.html("When an eigenspace has converged to three significant digits, the bar greys out. Drag the observations to change fit.")
sliderControl.slidera.init()
// var figwidth = d3.select("#poly2").style("width")
// var figheight = d3.select("#poly2").style("height")
// var svgannotate = d3.select("#poly2")
// .append("svg")
// .style("width", figwidth)
// .style("height", figheight)
// .style("position", "absolute")
// .style("top","0px")
// .style("left","0px")
// .style("pointer-events","none")
// renderDraggable(svgannotate,
// [139.88888549804688, 243.77951049804688],
// [121.88888549804688, 200.77951049804688],
// 5,
// "We begin at x=w=0");
// Swoopy Annotator
var annotations = [
{
"x": 0,
"y": 0,
"path": "M 74,202 A 52.274 52.274 0 0 0 134,245",
"text": "We begin at x=w=0",
"textOffset": [
21,
198
]
}
]
drawAnnotations(d3.select("#poly2"), annotations)
callback(null);
});
</script>
<p>
This effect is harnessed with the heuristic of early stopping: by stopping the optimization early, you can often get better generalizing results. Indeed, the effect of early stopping is very similar to that of more conventional methods of regularization, such as Tikhonov Regression. Both methods try to suppress the components of the smallest eigenvalues directly, though they employ different methods of spectral decay.<dt-fn>In Tikhonov Regression we add a quadratic penalty to the regression, minimizing
<dt-math block>
\text{minimize}\qquad\tfrac{1}{2}\|Zw-d\|^{2}+\frac{\eta}{2}\|w\|^{2}=\tfrac{1}{2}w^{T}(Z^{T}Z+\eta I)w-(Z^{T}d)^{T}w
</dt-math>
Recall that <dt-math>Z^{T}Z=Q\ \text{diag}(\lambda_{1},\ldots,\lambda_{n})\ Q^T</dt-math>. The solution to Tikhonov Regression is therefore
<dt-math block>
(Z^{T}Z+\eta I)^{-1}Z^{T}d=Q\ \text{diag}\left(\frac{1}{\lambda_{1}+\eta},\cdots,\frac{1}{\lambda_{n}+\eta}\right)Q^{T}Z^{T}d
</dt-math>
We can think of regularization as decaying the inverse eigenvalues, suppressing most strongly the components with the smallest <dt-math>\lambda_i</dt-math>, as follows:
<dt-math block>
\text{Tikhonov Regularized } \lambda_i = \frac{1}{\lambda_{i}+\eta}=\frac{1}{\lambda_{i}}\left(1-\left(1+\lambda_{i}/\eta\right)^{-1}\right).
</dt-math>
Gradient descent can be seen as employing a similar decay, but with the decay rate
<dt-math block> \text{ Gradient Descent Regularized } \lambda_i = \frac{1}{\lambda_i} \left( 1-\left(1-\alpha\lambda_{i}\right)^{k} \right)</dt-math>
instead. Note that this decay is dependent on the step-size.
</dt-fn> But early stopping has a distinct advantage. Once the step-size is chosen, there are no regularization parameters to fiddle with. Indeed, in the course of a single optimization, we have the entire family of models, from underfitted to overfitted, at our disposal. This gift, it seems, doesn't come at a price. A beautiful free lunch <dt-cite key="hintonNIPS"></dt-cite> indeed.
</p>
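<p>
The two spectral decays are easy to compare side by side, here with illustrative values of <dt-math>\eta</dt-math>, <dt-math>\alpha</dt-math> and <dt-math>k</dt-math>:
</p>
<dt-code block language="python">
import numpy as np

lam = np.array([0.01, 0.1, 1.0])     # eigenvalues of Z^T Z (illustrative)
eta, alpha, k = 0.1, 1.0, 20         # regularization strength, step-size, iteration count

# Tikhonov: each 1/lambda_i is replaced by 1/(lambda_i + eta).
tikhonov = 1 / (lam + eta)

# Early-stopped gradient descent: 1/lambda_i is replaced by
# (1 - (1 - alpha * lambda_i)^k) / lambda_i.
early_stop = (1 - (1 - alpha * lam) ** k) / lam

print(1 / lam)          # the unregularized inverse eigenvalues
print(tikhonov)         # the small-eigenvalue components are suppressed the most
print(early_stop)       # a similar suppression, controlled by k instead of eta
</dt-code>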
<hr>
<h2>The Dynamics of Momentum</h2>
<p>
Let's turn our attention back to momentum. Recall that the momentum update is
<dt-math block>
\begin{aligned}
z^{k+1}&=\beta z^{k}+\nabla f(w^{k})\\[0.4em]
w^{k+1}&=w^{k}-\alpha z^{k+1}.
\end{aligned}
</dt-math>
Since <dt-math>\nabla f(w^k) = Aw^k - b</dt-math>, the update on the quadratic is
<dt-math block>
\begin{aligned}
z^{k+1}&=\beta z^{k}+ (Aw^{k}-b)\\[0.4em]
w^{k+1}&=w^{k}-\alpha z^{k+1}.
\end{aligned}
</dt-math>
Following <dt-cite key="o2015adaptive"></dt-cite>, we go through the same motions, with the change of basis <dt-math>
x^{k} = Q^T(w^{k} - w^\star)</dt-math> and <dt-math> y^{k} = Q^Tz^{k}</dt-math>, to yield the update rule
<dt-math block>
\begin{aligned}
y_{i}^{k+1}&=\beta y_{i}^{k}+\lambda_{i}x_{i}^{k}\\[0.4em]
x_{i}^{k+1}&=x_{i}^{k}-\alpha y_{i}^{k+1}.
\end{aligned}
</dt-math>
in which each component acts independently of the other components (though <dt-math>x^k_i</dt-math> and <dt-math>y^k_i</dt-math> are coupled). This lets us rewrite our iterates as
<dt-fn>
This is true as we can write updates in matrix form as
<dt-math block>
\left(\!\!\begin{array}{cc}
1 & 0\\
\alpha & 1
\end{array}\!\!\right)\Bigg(\!\!\begin{array}{c}
y_{i}^{k+1}\\
x_{i}^{k+1}
\end{array}\!\!\Bigg)=\left(\!\!\begin{array}{cc}
\beta & \lambda_{i}\\
0 & 1
\end{array}\!\!\right)\left(\!\!\begin{array}{c}
y_{i}^{k}\\
x_{i}^{k}
\end{array}\!\!\right)
</dt-math>
which implies, by inverting the matrix on the left,
<dt-math block>
\Bigg(\!\!\begin{array}{c}
y_{i}^{k+1}\\
x_{i}^{k+1}
\end{array}\!\!\Bigg)=\left(\!\!\begin{array}{cc}
\beta & \lambda_{i}\\
-\alpha\beta & 1-\alpha\lambda_{i}
\end{array}\!\!\right)\left(\!\!\begin{array}{c}
y_{i}^{k}\\
x_{i}^{k}
\end{array}\!\!\right)=R^{k+1}\left(\!\!\begin{array}{c}
y_{i}^{0}\\
x_{i}^{0}
\end{array}\!\!\right)
</dt-math>
</dt-fn>
<dt-math block>
\left(\!\!\begin{array}{c}
y_{i}^{k}\\
x_{i}^{k}
\end{array}\!\!\right)=R^k\left(\!\!\begin{array}{c}
y_{i}^{0}\\
x_{i}^{0}
\end{array}\!\!\right)
\qquad
R = \left(\!\!\begin{array}{cc}
\beta & \lambda_{i}\\
-\alpha\beta & 1-\alpha\lambda_{i}
\end{array}\!\!\right).
</dt-math>
There are many ways of taking a matrix to the <dt-math>k^{th}</dt-math> power. But for the <dt-math>2 \times 2</dt-math> case there is an elegant and little-known formula <dt-cite key="williamsnthpower"></dt-cite> in terms of the eigenvalues of <dt-math>R</dt-math>, <dt-math>\sigma_1</dt-math> and <dt-math>\sigma_2</dt-math>.
<dt-math block>
\color{#AAA}{\color{black}{R^{k}}=\begin{cases}
\color{black}{\sigma_{1}^{k}}R_{2}-\color{black}{\sigma_{2}^{k}}R_{1} & \sigma_{1}\neq\sigma_{2}\\
\sigma_{1}^{k}(kR/\sigma_1-(k-1)I) & \sigma_{1}=\sigma_{2}
\end{cases},\qquad R_{j}=\frac{R-\sigma_{j}I}{\sigma_{1}-\sigma_{2}}}
</dt-math>
This formula is rather complicated, but the takeaway here is that it plays the exact same role that the individual convergence rates, <dt-math>1-\alpha\lambda_i</dt-math>, play in gradient descent. But instead of one geometric series, we have two coupled series, which may have real or complex values. The convergence rate is therefore the slower of the two rates, <dt-math>\max\{|\sigma_{1}|,|\sigma_{2}|\}</dt-math>
<dt-fn>
We can write out the convergence rates explicitly. The eigenvalues are
<dt-math block>
\begin{aligned}
\sigma_{1} & =\frac{1}{2}\left(1-\alpha\lambda+\beta+\sqrt{(-\alpha\lambda+\beta+1)^{2}-4\beta}\right)\\[0.6em]
\sigma_{2} & =\frac{1}{2}\left(1-\alpha\lambda+\beta-\sqrt{(-\alpha\lambda+\beta+1)^{2}-4\beta}\right)
\end{aligned}
</dt-math>
When <dt-math>(-\alpha\lambda+\beta+1)^{2}-4\beta</dt-math> is less than zero,
the roots are complex and the convergence rate is
<dt-math block>
\begin{aligned}
|\sigma_{1}|=|\sigma_{2}| & =\tfrac{1}{2}\sqrt{(1-\alpha\lambda+\beta)^{2}+|(-\alpha\lambda+\beta+1)^{2}-4\beta|}=\sqrt{\beta}
\end{aligned}
</dt-math>
This rate is, surprisingly, independent of the step-size <dt-math>\alpha</dt-math> and of the eigenvalue <dt-math>\lambda</dt-math>. When the roots are real, the convergence rate is
<dt-math block>
\max\{|\sigma_{1}|,|\sigma_{2}|\}=\tfrac{1}{2}\max\left\{ |1-\alpha\lambda+\beta\pm\sqrt{(1-\alpha\lambda+\beta)^{2}-4\beta}|\right\}
</dt-math>
</dt-fn>. By plotting this out, we see there are distinct regions of the parameter space which reveal a rich taxonomy of convergence behavior <dt-cite key="flammarion2015averaging"></dt-cite>:
</p>
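<p>
A small NumPy sketch makes this concrete: build the <dt-math>2\times 2</dt-math> matrix <dt-math>R</dt-math> for a single eigencomponent and take the largest modulus of its eigenvalues, letting the eigenvalue routine handle the real and complex cases uniformly.
</p>
<dt-code block language="python">
import numpy as np

def momentum_rate(alpha, beta, lam):
    """Convergence rate max(|sigma_1|, |sigma_2|) of one eigencomponent."""
    R = np.array([[beta,          lam],
                  [-alpha * beta, 1 - alpha * lam]])
    sigma = np.linalg.eigvals(R)       # a real pair or a complex-conjugate pair
    return np.max(np.abs(sigma))

# With beta = 0 this reduces to the gradient descent rate |1 - alpha * lam|:
print(momentum_rate(alpha=1.0, beta=0.0, lam=0.1))   # 0.9
# In the complex regime the rate is sqrt(beta), independent of alpha and lam:
print(momentum_rate(alpha=1.0, beta=0.9, lam=0.1))   # about 0.9487 = sqrt(0.9)
</dt-code>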
<figure id = "momentum2D" style="width:984px; height:540px">
<div class = "l-body" style="display:block">
<div id = "momentumCanvas" style="position:absolute; left:45px"></div>
<div id = "momentumAnnotation" style="position:absolute; width: 204px; height: 80px; left: 630px; top: 20px;"></div>
<div style="position:absolute; width: 204px; height: 80px; left: 643px; top: 10px;" class ="figtext" >
Convergence Rate
</div>
<figcaption style="position:absolute; width: 204px; height: 80px; left: 645px; top: 86px;">
A plot of <dt-math>\max\{|\sigma_1|, |\sigma_2|\}</dt-math> reveals distinct regions, each with its own style of convergence.
</figcaption>
</div>
<div id = "taxonomy"></div>
<svg id="momentumOverlay" style="position:absolute; width:984px; height:540px; z-index:4; pointer-events:none"></svg>
</figure>
<script src="assets/momentum.js"></script>
<script>
deleteQueue.push(renderLoading(d3.select("#momentum2D")))
renderQueue.push(function(callback) {
var defaults = [[0.0015, 0.9],
[0.0015, 0.125],
[0.01, 0.00001],
[0.02, 0.05 ],
[0.025, 0.235 ]]
coor = render2DSliderGen(
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[0](xy[0], xy[1],bold)
updateStemGraphs[0](a,b)
},
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[1](xy[0], xy[1],bold)
updateStemGraphs[1](a,b)
},
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[2](xy[0], xy[1],bold)
updateStemGraphs[2](a,b)
},
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[3](xy[0], xy[1],bold)
updateStemGraphs[3](a,b)
},
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[4](xy[0], xy[1],bold)
updateStemGraphs[4](a,b)
}, defaults)(d3.select("#momentumCanvas"))
var tax = renderTaxonomy(d3.select("#momentum2D"))
var updatePaths = renderOverlay(d3.select("#momentumOverlay"), tax.div)
var updateStemGraphs = tax.update
colorMap(
d3.select("#momentumAnnotation"),
180,
d3.scaleLinear().domain([0,0.3,0.5,0.7,1,1.01]).range(colorbrewer.YlGnBu[5].concat(["black"])),
d3.scaleLinear().domain([0,1.2001]).range([0, 180])
)
var up = function (i, alpha, beta) {
var xy = coor(alpha, beta)
updatePaths[i](xy[0], xy[1], true)
updateStemGraphs[i](alpha,beta)
}
for (var i = 0; i<5; i++) {
up(i,defaults[i][0], defaults[i][1])