-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcite.bib
10384 lines (9133 loc) · 491 KB
/
cite.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@misc{leviathan2023:specdec,
  author        = {Leviathan, Yaniv and Kalman, Matan and Matias, Yossi},
  title         = {Fast Inference from Transformers via Speculative Decoding},
  year          = {2023},
  eprint        = {2211.17192},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2211.17192},
}
@misc{dubey2024llama3herdmodels,
  author        = {Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and others},
  title         = {The {Llama} 3 Herd of Models},
  year          = {2024},
  eprint        = {2407.21783},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2407.21783},
}
@inproceedings{shi2017tnvnn,
  author    = {Shi, Shaohuai and Xu, Pengfei and Chu, Xiaowen},
  title     = {Supervised Learning Based Algorithm Selection for Deep Neural Networks},
  booktitle = {2017 {IEEE} 23rd International Conference on Parallel and Distributed Systems ({ICPADS})},
  year      = {2017},
  pages     = {344--351},
  keywords  = {Graphics processing units;Kernel;Artificial neural networks;Machine learning;Libraries;Machine learning algorithms;Linear Algebra;Matrix Multiplication;Transpose;GPU;Deep Neural Networks},
  doi       = {10.1109/ICPADS.2017.00053},
}
@manual{bokeh,
  author = {{Bokeh Development Team}},
  title  = {Bokeh: Python library for interactive visualization},
  year   = {2024},
  url    = {https://bokeh.org/},
}
@article{paraver,
  author  = {Pillet, Vincent and Labarta, Jes{\'u}s and Cortes, Toni and Girona, Sergi},
  title   = {{PARAVER}: A tool to visualize and analyze parallel code},
  journal = {WoTUG-18},
  volume  = {44},
  year    = {1995},
  month   = mar,
}
@article{grynbaum2023times,
  author  = {Grynbaum, Michael M. and Mac, Ryan},
  title   = {The {Times} Sues {OpenAI} and {Microsoft} Over {A.I.} Use of Copyrighted Work},
  journal = {The New York Times},
  year    = {2023},
  month   = dec,
}
@misc{parmar2024nemotron4,
  author        = {Jupinder Parmar and Shrimai Prabhumoye and Joseph Jennings and Mostofa Patwary and Sandeep Subramanian and Dan Su and Chen Zhu and Deepak Narayanan and Aastha Jhunjhunwala and Ayush Dattagupta and Vibhu Jawa and Jiwei Liu and Ameya Mahabaleshwarkar and Osvald Nitski and Annika Brundyn and James Maki and Miguel Martinez and Jiaxuan You and John Kamalu and Patrick LeGresley and Denys Fridman and Jared Casper and Ashwath Aithal and Oleksii Kuchaiev and Mohammad Shoeybi and Jonathan Cohen and Bryan Catanzaro},
  title         = {{Nemotron-4 15B} Technical Report},
  year          = {2024},
  eprint        = {2402.16819},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
@misc{pytorch-profiler,
  author       = {{Meta}},
  title        = {{PyTorch} Profiler},
  howpublished = {\url{https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html}},
}
@article{jumpshot,
  author  = {Zaki, Omer and Lusk, Ewing and Gropp, William and Swider, Deborah},
  title   = {Toward Scalable Performance Visualization with {Jumpshot}},
  journal = {The International Journal of High Performance Computing Applications},
  volume  = {13},
  number  = {3},
  pages   = {277--288},
  year    = {1999},
  doi     = {10.1177/109434209901300310},
  url     = {https://doi.org/10.1177/109434209901300310},
}
@software{omnitrace,
  author       = {{AMD Research}},
  title        = {Omnitrace: Application Profiling, Tracing, and Analysis},
  journal      = {GitHub repository},
  publisher    = {GitHub},
  howpublished = {\url{https://github.com/AMDResearch/omnitrace}},
}
@software{hta,
  author       = {{Meta Research}},
  title        = {Holistic Trace Analysis ({HTA}): A library to analyze {PyTorch} traces},
  journal      = {GitHub repository},
  publisher    = {GitHub},
  howpublished = {\url{https://github.com/facebookresearch/HolisticTraceAnalysis}},
}
@article{lamport,
  author     = {Lamport, Leslie},
  title      = {Time, clocks, and the ordering of events in a distributed system},
  journal    = {Commun. ACM},
  volume     = {21},
  number     = {7},
  pages      = {558--565},
  year       = {1978},
  month      = jul,
  issue_date = {July 1978},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0001-0782},
  doi        = {10.1145/359545.359563},
  url        = {https://doi.org/10.1145/359545.359563},
  numpages   = {8},
  keywords   = {multiprocess systems, distributed systems, computer networks, clock synchronization},
  abstract   = {The concept of one event happening before another in a distributed system is examined, and is shown to define a partial ordering of the events. A distributed algorithm is given for synchronizing a system of logical clocks which can be used to totally order the events. The use of the total ordering is illustrated with a method for solving synchronization problems. The algorithm is then specialized for synchronizing physical clocks, and a bound is derived on how far out of synchrony the clocks can become.},
}
@article{lateness,
  author   = {Isaacs, Katherine E. and Gamblin, Todd and Bhatele, Abhinav and Schulz, Martin and Hamann, Bernd and Bremer, Peer-Timo},
  title    = {Ordering Traces Logically to Identify Lateness in Message Passing Programs},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = {2016},
  volume   = {27},
  number   = {3},
  pages    = {829--840},
  keywords = {Visualization;Partitioning algorithms;Merging;Message passing;Delays;Trace analysis;performance},
  doi      = {10.1109/TPDS.2015.2417531},
}
@inproceedings{trace-vis-task-dependencies,
  author    = {Haugen, Blake and Richmond, Stephen and Kurzak, Jakub and Steed, Chad A. and Dongarra, Jack},
  title     = {Visualizing execution traces with task dependencies},
  booktitle = {Proceedings of the 2nd Workshop on Visual Performance Analysis},
  series    = {VPA '15},
  year      = {2015},
  isbn      = {9781450340137},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Austin, Texas},
  articleno = {2},
  numpages  = {8},
  doi       = {10.1145/2835238.2835240},
  url       = {https://doi.org/10.1145/2835238.2835240},
  keywords  = {task-based scheduling, execution trace, data movement, DAG},
  abstract  = {Task-based scheduling has emerged as one method to reduce the complexity of parallel computing. When using task-based schedulers, developers must frame their computation as a series of tasks with various data dependencies. The scheduler can take these tasks, along with their input and output dependencies, and schedule the task in parallel across a node or cluster. While these schedulers simplify the process of parallel software development, they can obfuscate the performance characteristics of the execution of an algorithm. The execution trace has been used for many years to give developers a visual representation of how their computations are performed. These methods can be employed to visualize when and where each of the tasks in a task-based algorithm is scheduled. In addition, the task dependencies can be used to create a directed acyclic graph (DAG) that can also be visualized to demonstrate the dependencies of the various tasks that make up a workload. The work presented here aims to combine these two data sets and extend execution trace visualization to better suit task-based workloads. This paper presents a brief description of task-based schedulers and the performance data they produce. It will then describe an interactive extension to the current trace visualization methods that combines the trace and DAG data sets. This new tool allows users to gain a greater understanding of how their tasks are scheduled. It also provides a simplified way for developers to evaluate and debug the performance of their scheduler.},
}
@misc{litgpt-2023,
  author       = {{Lightning AI}},
  title        = {{LitGPT}},
  howpublished = {\url{https://github.com/Lightning-AI/litgpt}},
  year         = {2023},
}
@inproceedings{rabenseifneroptimization2004,
  author    = {Rabenseifner, Rolf},
  title     = {Optimization of Collective Reduction Operations},
  editor    = {Bubak, Marian and van Albada, Geert Dick and Sloot, Peter M. A. and Dongarra, Jack},
  booktitle = {Computational Science - ICCS 2004},
  pages     = {1--9},
  year      = {2004},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  isbn      = {978-3-540-24685-5},
  abstract  = {A 5-year-profiling in production mode at the University of Stuttgart has shown that more than 40{\%} of the execution time of Message Passing Interface (MPI) routines is spent in the collective communication routines MPI{\_}Allreduce and MPI{\_}Reduce. Although MPI implementations are now available for about 10 years and all vendors are committed to this Message Passing Interface standard, the vendors' and publicly available reduction algorithms could be accelerated with new algorithms by a factor between 3 (IBM, sum) and 100 (Cray T3E, maxloc) for long vectors. This paper presents five algorithms optimized for different choices of vector size and number of processes. The focus is on bandwidth dominated protocols for power-of-two and non-power-of-two number of processes, optimizing the load balance in communication and computation.},
}
@inproceedings{thakurimproving2003,
  author    = {Thakur, Rajeev and Gropp, William D.},
  title     = {Improving the Performance of Collective Operations in MPICH},
  editor    = {Dongarra, Jack and Laforenza, Domenico and Orlando, Salvatore},
  booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface},
  pages     = {257--267},
  year      = {2003},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  isbn      = {978-3-540-39924-7},
  abstract  = {We report on our work on improving the performance of collective operations in MPICH on clusters connected by switched networks. For each collective operation, we use multiple algorithms depending on the message size, with the goal of minimizing latency for short messages and minimizing bandwidth usage for long messages. Although we have implemented new algorithms for all MPI collective operations, because of limited space we describe only the algorithms for allgather, broadcast, reduce-scatter, and reduce. We present performance results using the SKaMPI benchmark on a Myrinet-connected Linux cluster and an IBM SP. In all cases, the new algorithms significantly outperform the old algorithms used in MPICH on the Myrinet cluster, and, in many cases, they outperform the algorithms used in IBM's MPI on the SP.},
}
@inproceedings{Zhang2023,
  author    = {Zhen Zhang and Shuai Zheng and Yida Wang and Justin Chiu and George Karypis and Trishul Chilimbi and Mu Li and Xin Jin},
  title     = {{MiCS}: Near linear scaling for training gigantic model on public cloud},
  booktitle = {{VLDB} 2023},
  year      = {2023},
  url       = {https://www.amazon.science/publications/mics-near-linear-scaling-for-training-gigantic-model-on-public-cloud},
}
@inproceedings{wang2024zero,
  author    = {Guanhua Wang and Heyang Qin and Sam Ade Jacobs and Xiaoxia Wu and Connor Holmes and Zhewei Yao and Samyam Rajbhandari and Olatunji Ruwase and Feng Yan and Lei Yang and Yuxiong He},
  title     = {{ZeRO++}: Extremely Efficient Collective Communication for Large Model Training},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024},
  url       = {https://openreview.net/forum?id=gx2BT0a9MQ},
}
@inproceedings{black-etal-2022-gpt,
  author    = {Black, Sidney and Biderman, Stella and Hallahan, Eric and Anthony, Quentin and Gao, Leo and Golding, Laurence and He, Horace and Leahy, Connor and McDonell, Kyle and Phang, Jason and Pieler, Michael and Prashanth, Usvsn Sai and Purohit, Shivanshu and Reynolds, Laria and Tow, Jonathan and Wang, Ben and Weinbach, Samuel},
  title     = {{GPT-NeoX-20B}: An Open-Source Autoregressive Language Model},
  editor    = {Fan, Angela and Ilic, Suzana and Wolf, Thomas and Gall{\'e}, Matthias},
  booktitle = {Proceedings of BigScience Episode {\#}5 -- Workshop on Challenges {\&} Perspectives in Creating Large Language Models},
  month     = may,
  year      = {2022},
  address   = {virtual+Dublin},
  publisher = {Association for Computational Linguistics},
  pages     = {95--136},
  doi       = {10.18653/v1/2022.bigscience-1.9},
  url       = {https://aclanthology.org/2022.bigscience-1.9},
}
@misc{luo2019adaptive,
  author        = {Luo, Liangchen and Xiong, Yuanhao and Liu, Yan and Sun, Xu},
  title         = {Adaptive Gradient Methods with Dynamic Bound of Learning Rate},
  year          = {2019},
  eprint        = {1902.09843},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
@misc{keskar2017improving,
  author        = {Nitish Shirish Keskar and Richard Socher},
  title         = {Improving Generalization Performance by Switching from {Adam} to {SGD}},
  year          = {2017},
  eprint        = {1712.07628},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
@article{zhuang2020adabelief,
  author  = {Zhuang, Juntang and Tang, Tommy and Ding, Yifan and Tatikonda, Sekhar C and Dvornek, Nicha and Papademetris, Xenophon and Duncan, James},
  title   = {{AdaBelief} Optimizer: Adapting Stepsizes by the Belief in Observed Gradients},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {33},
  year    = {2020},
}
@misc{he2016identity,
  author        = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title         = {Identity Mappings in Deep Residual Networks},
  year          = {2016},
  eprint        = {1603.05027},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}
@inproceedings{tang2021skfac,
  author    = {Tang, Zedong and Jiang, Fenlong and Gong, Maoguo and Li, Hao and Wu, Yue and Yu, Fan and Wang, Zidong and Wang, Min},
  title     = {{SKFAC}: Training Neural Networks With Faster {Kronecker}-Factored Approximate Curvature},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2021},
  pages     = {13479--13487},
}
@inproceedings{zhang2023eva,
  author    = {Lin Zhang and Shaohuai Shi and Bo Li},
  title     = {Eva: Practical Second-order Optimization with {Kronecker}-vectorized Approximation},
  booktitle = {The Eleventh International Conference on Learning Representations},
  year      = {2023},
  url       = {https://openreview.net/forum?id=_Mic8V96Voy},
}
@misc{wang2017stochastic,
  author        = {Xiao Wang and Shiqian Ma and Donald Goldfarb and Wei Liu},
  title         = {Stochastic Quasi-{Newton} Methods for Nonconvex Stochastic Optimization},
  year          = {2017},
  eprint        = {1607.01231},
  archivePrefix = {arXiv},
  primaryClass  = {math.OC},
}
@misc{bollapragada2018progressive,
  author        = {Raghu Bollapragada and Dheevatsa Mudigere and Jorge Nocedal and Hao-Jun Michael Shi and Ping Tak Peter Tang},
  title         = {A Progressive Batching {L-BFGS} Method for Machine Learning},
  year          = {2018},
  eprint        = {1802.05374},
  archivePrefix = {arXiv},
  primaryClass  = {math.OC},
}
@misc{berahas2016multibatch,
  author        = {Albert S. Berahas and Jorge Nocedal and Martin Tak{\'a}{\v{c}}},
  title         = {A Multi-Batch {L-BFGS} Method for Machine Learning},
  year          = {2016},
  eprint        = {1605.06049},
  archivePrefix = {arXiv},
  primaryClass  = {math.OC},
}
@misc{erdogdu2015convergence,
  author        = {Murat A. Erdogdu and Andrea Montanari},
  title         = {Convergence rates of sub-sampled {Newton} methods},
  year          = {2015},
  eprint        = {1508.02810},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
}
@misc{perlmutter,
  author       = {{NERSC}},
  title        = {{Perlmutter} System Architecture},
  howpublished = {\url{https://docs.nersc.gov/systems/perlmutter/architecture/}},
}
@inproceedings{heo2021adamp,
  author    = {Heo, Byeongho and Chun, Sanghyuk and Oh, Seong Joon and Han, Dongyoon and Yun, Sangdoo and Kim, Gyuwan and Uh, Youngjung and Ha, Jung-Woo},
  title     = {{AdamP}: Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights},
  booktitle = {International Conference on Learning Representations ({ICLR})},
  year      = {2021},
}
@software{torchvision2016,
  author       = {{TorchVision maintainers and contributors}},
  title        = {{TorchVision}: {PyTorch}'s Computer Vision library},
  year         = {2016},
  journal      = {GitHub repository},
  publisher    = {GitHub},
  howpublished = {\url{https://github.com/pytorch/vision}},
}
@misc{mozaffari2023mkor,
  author        = {Mohammad Mozaffari and Sikan Li and Zhao Zhang and Maryam Mehri Dehnavi},
  title         = {{MKOR}: Momentum-Enabled {Kronecker}-Factor-Based Optimizer Using Rank-1 Updates},
  year          = {2023},
  eprint        = {2306.01685},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
@misc{shi2023distributed,
  author        = {Hao-Jun Michael Shi and Tsung-Hsien Lee and Shintaro Iwasaki and Jose Gallego-Posada and Zhijing Li and Kaushik Rangadurai and Dheevatsa Mudigere and Michael Rabbat},
  title         = {A Distributed Data-Parallel {PyTorch} Implementation of the Distributed {Shampoo} Optimizer for Training Neural Networks At-Scale},
  year          = {2023},
  eprint        = {2309.06497},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
@misc{shi2021accelerating,
  author        = {Shaohuai Shi and Lin Zhang and Bo Li},
  title         = {Accelerating Distributed {K-FAC} with Smart Parallelism of Computing and Communication Tasks},
  year          = {2021},
  eprint        = {2107.06533},
  archivePrefix = {arXiv},
  primaryClass  = {cs.DC},
}
@inproceedings{ueno2020rich,
  author    = {Ueno, Yuichiro and Osawa, Kazuki and Tsuji, Yohei and Naruse, Akira and Yokota, Rio},
  title     = {Rich Information is Affordable: A Systematic Performance Analysis of Second-Order Optimization Using {K-FAC}},
  booktitle = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  series    = {KDD '20},
  year      = {2020},
  pages     = {2145--2153},
  numpages  = {9},
  isbn      = {9781450379984},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Virtual Event, CA, USA},
  doi       = {10.1145/3394486.3403265},
  url       = {https://doi.org/10.1145/3394486.3403265},
  keywords  = {distributed training, information matrix, performance optimization},
  abstract  = {Rich information matrices from first and second-order derivatives have many potential applications in both theoretical and practical problems in deep learning. However, computing these information matrices is extremely expensive and this enormous cost is currently limiting its application to important problems regarding generalization, hyperparameter tuning, and optimization of deep neural networks. One of the most challenging use cases of information matrices is their use as a preconditioner for the optimizers, since the information matrices need to be updated every step. In this work, we conduct a step-by-step performance analysis when computing the Fisher information matrix during training of ResNet-50 on ImageNet, and show that the overhead can be reduced to the same amount as the cost of performing a single SGD step. We also show that the resulting Fisher preconditioned optimizer can converge in 1/3 the number of epochs compared to SGD, while achieving the same Top-1 validation accuracy. This is the first work to achieve such accuracy with K-FAC while reducing the training time to match that of SGD.},
}
@misc{osawa2020scalable,
  author        = {Osawa, Kazuki and Tsuji, Yohei and Ueno, Yuichiro and Naruse, Akira and Foo, Chuan-Sheng and Yokota, Rio},
  title         = {Scalable and Practical Natural Gradient for Large-Scale Deep Learning},
  year          = {2020},
  eprint        = {2002.06015},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
@misc{osawa2019largescale,
  author        = {Kazuki Osawa and Yohei Tsuji and Yuichiro Ueno and Akira Naruse and Rio Yokota and Satoshi Matsuoka},
  title         = {Large-Scale Distributed Second-Order Optimization Using {Kronecker}-Factored Approximate Curvature for Deep Convolutional Neural Networks},
  year          = {2019},
  eprint        = {1811.12019},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
@inproceedings{pauloski-2020-kfac,
  author    = {Pauloski, J. Gregory and Zhang, Zhao and Huang, Lei and Xu, Weijia and Foster, Ian T.},
  title     = {Convolutional {Neural} {Network} {Training} with {Distributed} {K-FAC}},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  series    = {SC '20},
  year      = {2020},
  isbn      = {9781728199986},
  publisher = {IEEE Press},
  location  = {Atlanta, Georgia},
  articleno = {94},
  numpages  = {14},
  doi       = {10.5555/3433701.3433826},
}
@inproceedings{pauloski-kaisa-2021,
  author    = {Pauloski, J. Gregory and Huang, Qi and Huang, Lei and Venkataraman, Shivaram and Chard, Kyle and Foster, Ian and Zhang, Zhao},
  title     = {{KAISA}: An Adaptive Second-Order Optimizer Framework for Deep Neural Networks},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  publisher = {{ACM}},
  year      = {2021},
  month     = nov,
  doi       = {10.1145/3458817.3476152},
  url       = {https://doi.org/10.1145/3458817.3476152},
}
@inproceedings{agarwal-ggt-2019,
  author    = {Agarwal, Naman and Bullins, Brian and Chen, Xinyi and Hazan, Elad and Singh, Karan and Zhang, Cyril and Zhang, Yi},
  title     = {Efficient Full-Matrix Adaptive Regularization},
  booktitle = {Proceedings of the 36th International Conference on Machine Learning},
  editor    = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {97},
  pages     = {102--110},
  year      = {2019},
  month     = {09--15 Jun},
  publisher = {PMLR},
  pdf       = {http://proceedings.mlr.press/v97/agarwal19b/agarwal19b.pdf},
  url       = {https://proceedings.mlr.press/v97/agarwal19b.html},
  abstract  = {Adaptive regularization methods pre-multiply a descent direction by a preconditioning matrix. Due to the large number of parameters of machine learning problems, full-matrix preconditioning methods are prohibitively expensive. We show how to modify full-matrix adaptive regularization in order to make it practical and effective. We also provide a novel theoretical analysis for adaptive regularization in \emph{non-convex} optimization settings. The core of our algorithm, termed GGT, consists of the efficient computation of the inverse square root of a low-rank matrix. Our preliminary experiments show improved iteration-wise convergence rates across synthetic tasks and standard deep learning benchmarks, and that the more carefully-preconditioned steps sometimes lead to a better solution.},
}
@inproceedings{hessian-free-rnn,
  author    = {Martens, James and Sutskever, Ilya},
  title     = {Learning Recurrent Neural Networks with {Hessian}-Free Optimization},
  booktitle = {Proceedings of the 28th International Conference on Machine Learning},
  series    = {ICML'11},
  year      = {2011},
  pages     = {1033--1040},
  numpages  = {8},
  isbn      = {9781450306195},
  publisher = {Omnipress},
  address   = {Madison, WI, USA},
  location  = {Bellevue, Washington, USA},
  abstract  = {In this work we resolve the long-outstanding problem of how to effectively train recurrent neural networks (RNNs) on complex and difficult sequence modeling problems which may contain long-term data dependencies. Utilizing recent advances in the Hessian-free optimization approach (Martens, 2010), together with a novel damping scheme, we successfully train RNNs on two sets of challenging problems. First, a collection of pathological synthetic datasets which are known to be impossible for standard optimization approaches (due to their extremely long-term dependencies), and second, on three natural and highly complex real-world sequence datasets where we find that our method significantly outperforms the previous state-of-the-art method for training neural sequence models: the Long Short-term Memory approach of Hochreiter and Schmidhuber (1997). Additionally, we offer a new interpretation of the generalized Gauss-Newton matrix of Schraudolph (2002) which is used within the HF approach of Martens.},
}
@misc{grosse2016kfacconvolution,
  author        = {Roger Grosse and James Martens},
  title         = {A {Kronecker}-factored approximate {Fisher} matrix for convolution layers},
  year          = {2016},
  eprint        = {1602.01407},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
}
@misc{zhu2019anisotropic,
  author        = {Zhu, Zhanxing and Wu, Jingfeng and Yu, Bing and Wu, Lei and Ma, Jinwen},
  title         = {The Anisotropic Noise in Stochastic Gradient Descent: Its Behavior of Escaping from Sharp Minima and Regularization Effects},
  year          = {2019},
  eprint        = {1803.00195},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
}
@misc{sagun2018empirical,
  author        = {Levent Sagun and Utku Evci and V. Ugur Guney and Yann Dauphin and Leon Bottou},
  title         = {Empirical Analysis of the {Hessian} of Over-Parametrized Neural Networks},
  year          = {2018},
  eprint        = {1706.04454},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
@misc{jastrzebski2018factors,
  author        = {Stanislaw Jastrzebski and Zachary Kenton and Devansh Arpit and Nicolas Ballas and Asja Fischer and Yoshua Bengio and Amos Storkey},
  title         = {Three Factors Influencing Minima in {SGD}},
  year          = {2018},
  eprint        = {1711.04623},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
@article{duchi:jmlr2011,
  author  = {Duchi, John and Hazan, Elad and Singer, Yoram},
  title   = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  number  = {61},
  pages   = {2121--2159},
  year    = {2011},
  url     = {http://jmlr.org/papers/v12/duchi11a.html},
}
@article{martens:jmlr2020,
  author     = {Martens, James},
  title      = {New Insights and Perspectives on the Natural Gradient Method},
  journal    = {J. Mach. Learn. Res.},
  volume     = {21},
  number     = {1},
  year       = {2020},
  month      = jan,
  issue_date = {January 2020},
  publisher  = {JMLR.org},
  issn       = {1532-4435},
  articleno  = {146},
  numpages   = {76},
  keywords   = {neural networks, convergence rate, parameterization invariance, natural gradient methods, 2nd-order optimization},
  abstract   = {Natural gradient descent is an optimization method traditionally motivated from the perspective of information geometry, and works well for many applications as an alternative to stochastic gradient descent. In this paper we critically analyze this method and its properties, and show how it can be viewed as a type of 2nd-order optimization method, with the Fisher information matrix acting as a substitute for the Hessian. In many important cases, the Fisher information matrix is shown to be equivalent to the Generalized Gauss-Newton matrix, which both approximates the Hessian, but also has certain properties that favor its use over the Hessian. This perspective turns out to have significant implications for the design of a practical and robust natural gradient optimizer, as it motivates the use of techniques like trust regions and Tikhonov regularization. Additionally, we make a series of contributions to the understanding of natural gradient and 2nd-order methods, including: a thorough analysis of the convergence speed of stochastic natural gradient descent (and more general stochastic 2nd-order methods) as applied to convex quadratics, a critical examination of the oft-used "empirical" approximation of the Fisher matrix, and an analysis of the (approximate) parameterization invariance property possessed by natural gradient methods (which we show also holds for certain other curvature matrices, but notably not the Hessian).},
}
@inproceedings{kunstner:neurips2019,
  author    = {Kunstner, Frederik and Hennig, Philipp and Balles, Lukas},
  title     = {Limitations of the empirical {Fisher} approximation for natural gradient descent},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d'Alch{\'e}-Buc and E. Fox and R. Garnett},
  publisher = {Curran Associates, Inc.},
  volume    = {32},
  year      = {2019},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2019/file/46a558d97954d0692411c861cf78ef79-Paper.pdf},
}
@inproceedings{tonga,
  author    = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
  title     = {Topmoumoute Online Natural Gradient Algorithm},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {J. Platt and D. Koller and Y. Singer and S. Roweis},
  publisher = {Curran Associates, Inc.},
  volume    = {20},
  year      = {2007},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2007/file/9f61408e3afb633e50cdf1b20de6f466-Paper.pdf},
}
@inproceedings{Desjardins:nips2015,
  author    = {Desjardins, Guillaume and Simonyan, Karen and Pascanu, Razvan and Kavukcuoglu, Koray},
  title     = {Natural Neural Networks},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
  publisher = {Curran Associates, Inc.},
  volume    = {28},
  year      = {2015},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2015/file/2de5d16682c3c35007e4e92982f1a2ba-Paper.pdf},
}
@article{Park2000AdaptiveNG,
  author  = {Park, Hyeyoung and Amari, Shun-ichi and Fukumizu, Kenji},
  title   = {Adaptive natural gradient learning algorithms for various stochastic models},
  journal = {Neural Networks},
  volume  = {13},
  number  = {7},
  pages   = {755--764},
  year    = {2000},
  url     = {https://api.semanticscholar.org/CorpusID:6471036}
}
@article{ngd-og,
  author   = {Amari, Shun-ichi},
  title    = {Natural Gradient Works Efficiently in Learning},
  journal  = {Neural Computation},
  volume   = {10},
  number   = {2},
  pages    = {251--276},
  year     = {1998},
  month    = feb,
  abstract = {When a parameter space has a certain underlying structure, the ordinary gradient of a function does not represent its steepest direction, but the natural gradient does. Information geometry is used for calculating the natural gradients in the parameter space of perceptrons, the space of matrices (for blind source separation), and the space of linear dynamical systems (for blind source deconvolution). The dynamical behavior of natural gradient online learning is analyzed and is proved to be Fisher efficient, implying that it has asymptotically the same performance as the optimal batch estimation of parameters. This suggests that the plateau phenomenon, which appears in the backpropagation learning algorithm of multilayer perceptrons, might disappear or might not be so serious when the natural gradient is used. An adaptive method of updating the learning rate is proposed and analyzed.},
  issn     = {0899-7667},
  doi      = {10.1162/089976698300017746},
  url      = {https://doi.org/10.1162/089976698300017746},
  pdf      = {https://direct.mit.edu/neco/article-pdf/10/2/251/813415/089976698300017746.pdf}
}
@InProceedings{botev-practical-17,
  author    = {Botev, Aleksandar and Ritter, Hippolyt and Barber, David},
  title     = {Practical {Gauss-Newton} Optimisation for Deep Learning},
  booktitle = {Proceedings of the 34th International Conference on Machine Learning},
  editor    = {Precup, Doina and Teh, Yee Whye},
  series    = {Proceedings of Machine Learning Research},
  volume    = {70},
  pages     = {557--565},
  publisher = {PMLR},
  year      = {2017},
  month     = aug,
  eventdate = {2017-08-06/2017-08-11},
  pdf       = {http://proceedings.mlr.press/v70/botev17a/botev17a.pdf},
  url       = {https://proceedings.mlr.press/v70/botev17a.html},
  abstract  = {We present an efficient block-diagonal approximation to the Gauss-Newton matrix for feedforward neural networks. Our resulting algorithm is competitive against state-of-the-art first-order optimisation methods, with sometimes significant improvement in optimisation performance. Unlike first-order methods, for which hyperparameter tuning of the optimisation parameters is often a laborious process, our approach can provide good performance even when used with default settings. A side result of our work is that for piecewise linear transfer functions, the network objective function can have no differentiable local maxima, which may partially explain why such transfer functions facilitate effective optimisation.}
}
@InProceedings{krylov-subspace-descent,
  author    = {Vinyals, Oriol and Povey, Daniel},
  title     = {{Krylov} Subspace Descent for Deep Learning},
  booktitle = {Proceedings of the Fifteenth International Conference on Artificial Intelligence and Statistics},
  editor    = {Lawrence, Neil D. and Girolami, Mark},
  series    = {Proceedings of Machine Learning Research},
  volume    = {22},
  pages     = {1261--1268},
  publisher = {PMLR},
  year      = {2012},
  month     = apr,
  eventdate = {2012-04-21/2012-04-23},
  venue     = {La Palma, Canary Islands},
  pdf       = {http://proceedings.mlr.press/v22/vinyals12/vinyals12.pdf},
  url       = {https://proceedings.mlr.press/v22/vinyals12.html},
  abstract  = {In this paper, we propose a second order optimization method to learn models where both the dimensionality of the parameter space and the number of training samples is high. In our method, we construct on each iteration a Krylov subspace formed by the gradient and an approximation to the Hessian matrix, and then use a subset of the training data samples to optimize over this subspace. As with the Hessian Free (HF) method of Martens (2010), the Hessian matrix is never explicitly constructed, and is computed using a subset of data. In practice, as in HF, we typically use a positive definite substitute for the Hessian matrix such as the Gauss-Newton matrix. We investigate the effectiveness of our proposed method on deep neural networks, and compare its performance to widely used methods such as stochastic gradient descent, conjugate gradient descent and L-BFGS, and also to HF. Our method leads to faster convergence than either L-BFGS or HF, and generally performs better than either of them in cross-validation accuracy. It is also simpler and more general than HF, as it does not require a positive semidefinite approximation of the Hessian matrix to work well nor the setting of a damping parameter. The chief drawback versus HF is the need for memory to store a basis for the Krylov subspace.}
}
@inproceedings{hessian-free-optimization,
  author    = {Martens, James},
  title     = {Deep learning via {Hessian}-free optimization},
  booktitle = {Proceedings of the 27th International Conference on Machine Learning ({ICML})},
  editor    = {F{\"u}rnkranz, Johannes and Joachims, Thorsten},
  pages     = {735--742},
  publisher = {Omnipress},
  year      = 2010,
  url       = {http://dblp.uni-trier.de/db/conf/icml/icml2010.html#Martens10},
  ee        = {http://www.icml2010.org/papers/458.pdf},
  keywords  = {machinelearning neural-networks optimisation recurrent-neural-networks},
  added-at  = {2011-07-08T14:11:15.000+0200},
  timestamp = {2011-07-08T14:11:15.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/2af0029f21446a26c04f2e4650ec1fbf1/gromgull},
  interhash = {1d6577ca73270732c2cc1e3c2cce6cdb},
  intrahash = {af0029f21446a26c04f2e4650ec1fbf1}
}
@article{schraudolphGGN,
  author     = {Schraudolph, Nicol N.},
  title      = {Fast Curvature Matrix-Vector Products for Second-Order Gradient Descent},
  journal    = {Neural Computation},
  volume     = {14},
  number     = {7},
  pages      = {1723--1738},
  numpages   = {16},
  year       = {2002},
  month      = jul,
  issue_date = {July 2002},
  publisher  = {MIT Press},
  address    = {Cambridge, MA, USA},
  issn       = {0899-7667},
  doi        = {10.1162/08997660260028683},
  url        = {https://doi.org/10.1162/08997660260028683},
  abstract   = {We propose a generic method for iteratively approximating various second-order gradient steps--Newton, Gauss-Newton, Levenberg-Marquardt, and natural gradient--in linear time per iteration, using special curvature matrix-vector products that can be computed in O(n). Two recent acceleration techniques for on-line learning, matrix momentum and stochastic meta-descent (SMD), implement this approach. Since both were originally derived by very different routes, this offers fresh insight into their operation, resulting in further improvements to SMD.}
}
@inproceedings{loshchilov2017sgdr,
  author    = {Ilya Loshchilov and Frank Hutter},
  title     = {{SGDR}: Stochastic Gradient Descent with Warm Restarts},
  booktitle = {International Conference on Learning Representations},
  year      = {2017},
  url       = {https://openreview.net/forum?id=Skq89Scxx}
}
@misc{grafting,
  author        = {Naman Agarwal and Rohan Anil and Elad Hazan and Tomer Koren and Cyril Zhang},
  title         = {Disentangling Adaptive Gradient Methods from Learning Rates},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2002.11803},
  primaryClass  = {cs.LG}
}
@inproceedings{adapative-optimizer-bad-sgd-good,
  author    = {Wilson, Ashia C and Roelofs, Rebecca and Stern, Mitchell and Srebro, Nati and Recht, Benjamin},
  title     = {The Marginal Value of Adaptive Gradient Methods in Machine Learning},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {I. Guyon and U. Von Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
  volume    = {30},
  publisher = {Curran Associates, Inc.},
  year      = {2017},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2017/file/81b3833e2504647f9d794f7d7b9bf341-Paper.pdf}
}
@inproceedings{dense-net,
  author    = {Huang, Gao and Liu, Zhuang and van der Maaten, Laurens and Weinberger, Kilian Q.},
  title     = {Densely Connected Convolutional Networks},
  booktitle = {2017 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2017, Honolulu, HI, USA, July 21-26, 2017},
  pages     = {2261--2269},
  publisher = {IEEE Computer Society},
  year      = 2017,
  isbn      = {978-1-5386-0457-1},
  doi       = {10.1109/CVPR.2017.243},
  url       = {http://dblp.uni-trier.de/db/conf/cvpr/cvpr2017.html#HuangLMW17},
  ee        = {http://doi.ieeecomputersociety.org/10.1109/CVPR.2017.243},
  keywords  = {classification densenet image},
  added-at  = {2018-09-04T11:33:01.000+0200},
  timestamp = {2018-09-04T11:41:32.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/24ea2e82bd87f8102b9f1f14a98b4dc53/nosebrain},
  interhash = {39c8ce8d8104d4c557d508eb421fb90c},
  intrahash = {4ea2e82bd87f8102b9f1f14a98b4dc53}
}
@incollection{ssd,
  author    = {Wei Liu and Dragomir Anguelov and Dumitru Erhan and Christian Szegedy and Scott Reed and Cheng-Yang Fu and Alexander C. Berg},
  title     = {{SSD}: Single Shot {MultiBox} Detector},
  booktitle = {Computer Vision {\textendash} {ECCV} 2016},
  pages     = {21--37},
  publisher = {Springer International Publishing},
  year      = 2016,
  doi       = {10.1007/978-3-319-46448-0_2},
  url       = {https://doi.org/10.1007/978-3-319-46448-0_2}
}
@incollection{alexnet,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
  title     = {{ImageNet} Classification with Deep Convolutional Neural Networks},
  booktitle = {Advances in Neural Information Processing Systems 25},
  editor    = {Pereira, F. and Burges, C. J. C. and Bottou, L. and Weinberger, K. Q.},
  pages     = {1097--1105},
  publisher = {Curran Associates, Inc.},
  year      = 2012,
  url       = {http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf},
  keywords  = {cnn deeplearning ma-zehe neuralnet},
  added-at  = {2016-11-14T12:05:24.000+0100},
  timestamp = {2016-11-14T12:05:24.000+0100},
  biburl    = {https://www.bibsonomy.org/bibtex/2886c491fe45049fee3c9660df30bb5c4/albinzehe},
  interhash = {74bbb5dea5afb1b088bd10e317f1f0d2},
  intrahash = {886c491fe45049fee3c9660df30bb5c4}
}
@article{sgd-nesterov,
  author  = {Nesterov, Yurii},
  title   = {A method for solving the convex programming problem with convergence rate {$\mathcal{O}(1/k^2)$}},
  journal = {Proceedings of the USSR Academy of Sciences},
  volume  = {269},
  pages   = {543--547},
  year    = {1983},
  url     = {https://api.semanticscholar.org/CorpusID:145918791}
}
@article{sgd-momentum,
  author   = {Polyak, B. T.},
  title    = {Some methods of speeding up the convergence of iteration methods},
  journal  = {USSR Computational Mathematics and Mathematical Physics},
  volume   = {4},
  number   = {5},
  pages    = {1--17},
  year     = {1964},
  issn     = {0041-5553},
  doi      = {10.1016/0041-5553(64)90137-5},
  url      = {https://www.sciencedirect.com/science/article/pii/0041555364901375},
  abstract = {For the solution of the functional equation P (x) = 0 (1) (where P is an operator, usually linear, from B into B, and B is a Banach space) iteration methods are generally used. These consist of the construction of a series x0, …, xn, …, which converges to the solution (see, for example [1]). Continuous analogues of these methods are also known, in which a trajectory x(t), 0 ⩽ t ⩽ ∞ is constructed, which satisfies the ordinary differential equation in B and is such that x(t) approaches the solution of (1) as t → ∞ (see [2]). We shall call the method a k-step method if for the construction of each successive iteration xn+1 we use k previous iterations xn, …, xn−k+1. The same term will also be used for continuous methods if x(t) satisfies a differential equation of the k-th order or k-th degree. Iteration methods which are more widely used are one-step (e.g. methods of successive approximations). They are generally simple from the calculation point of view but often converge very slowly. This is confirmed both by the evaluation of the speed of convergence and by calculation in practice (for more details see below). Therefore the question of the rate of convergence is most important. Some multistep methods, which we shall consider further, which are only slightly more complicated than the corresponding one-step methods, make it possible to speed up the convergence substantially. Note that all the methods mentioned below are applicable also to the problem of minimizing the differentiable functional (x) in Hilbert space, so long as this problem reduces to the solution of the equation grad (x) = 0.}
}
@article{sgd,
  author    = {Robbins, Herbert and Monro, Sutton},
  title     = {A Stochastic Approximation Method},
  journal   = {The Annals of Mathematical Statistics},
  volume    = {22},
  number    = {3},
  pages     = {400--407},
  publisher = {Institute of Mathematical Statistics},
  year      = {1951},
  doi       = {10.1214/aoms/1177729586},
  url       = {https://doi.org/10.1214/aoms/1177729586}
}
@inproceedings{mask-rcnn,
  author    = {He, Kaiming and Gkioxari, Georgia and Doll{\'{a}}r, Piotr and Girshick, Ross B.},
  title     = {Mask {R-CNN}},
  booktitle = {{IEEE} International Conference on Computer Vision, {ICCV} 2017, Venice, Italy, October 22-29, 2017},
  pages     = {2980--2988},
  publisher = {{IEEE} Computer Society},
  year      = 2017,
  doi       = {10.1109/ICCV.2017.322},
  url       = {https://doi.org/10.1109/ICCV.2017.322},
  keywords  = {instance mask mask-rcnn segmentation},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://www.bibsonomy.org/bibtex/2d2deec4bb1449a5f55dcc9086b669e37/pkoch},
  added-at  = {2021-07-07T12:16:11.000+0200},
  timestamp = {2021-07-07T12:16:11.000+0200},
  interhash = {3743d2a88223517f9adc496b9ad099bc},
  intrahash = {d2deec4bb1449a5f55dcc9086b669e37}
}
@misc{iyer2020wideminima,
  author        = {Iyer, Nikhil and Thejas, V and Kwatra, Nipun and Ramjee, Ramachandran and Sivathanu, Muthian},
  title         = {Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule},
  year          = {2020},
  eprint        = {2003.03977},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG}
}
@misc{mscoco,
  author        = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Bourdev, Lubomir and Girshick, Ross and Hays, James and Perona, Pietro and Ramanan, Deva and Zitnick, C. Lawrence and Doll{\'a}r, Piotr},
  title         = {Microsoft {COCO}: Common Objects in Context},
  year          = {2015},
  eprint        = {1405.0312},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}
@misc{deeplabv3,
  author        = {Liang-Chieh Chen and George Papandreou and Florian Schroff and Hartwig Adam},
  title         = {Rethinking Atrous Convolution for Semantic Image Segmentation},
  year          = {2017},
  archivePrefix = {arXiv},
  eprint        = {1706.05587},
  primaryClass  = {cs.CV}
}
@inproceedings{faster_rcnn,
  author    = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian},
  title     = {Faster {R-CNN}: Towards Real-Time Object Detection with Region Proposal Networks},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
  volume    = {28},
  publisher = {Curran Associates, Inc.},
  year      = {2015},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2015/file/14bfa6bb14875e45bba028a21ed38046-Paper.pdf}
}
@misc{shampoo-scalable,
  author        = {Rohan Anil and Vineet Gupta and Tomer Koren and Kevin Regan and Yoram Singer},
  title         = {Scalable Second Order Optimization for Deep Learning},
  year          = {2021},
  archivePrefix = {arXiv},
  eprint        = {2002.09018},
  primaryClass  = {cs.LG}
}
@InProceedings{shampoo-icml,
  author    = {Gupta, Vineet and Koren, Tomer and Singer, Yoram},
  title     = {Shampoo: Preconditioned Stochastic Tensor Optimization},
  booktitle = {Proceedings of the 35th International Conference on Machine Learning},
  editor    = {Dy, Jennifer and Krause, Andreas},
  series    = {Proceedings of Machine Learning Research},
  volume    = {80},
  pages     = {1842--1850},
  publisher = {PMLR},
  year      = {2018},
  month     = jul,
  eventdate = {2018-07-10/2018-07-15},
  pdf       = {http://proceedings.mlr.press/v80/gupta18a/gupta18a.pdf},
  url       = {https://proceedings.mlr.press/v80/gupta18a.html},
  abstract  = {Preconditioned gradient methods are among the most general and powerful tools in optimization. However, preconditioning requires storing and manipulating prohibitively large matrices. We describe and analyze a new structure-aware preconditioning algorithm, called Shampoo, for stochastic optimization over tensor spaces. Shampoo maintains a set of preconditioning matrices, each of which operates on a single dimension, contracting over the remaining dimensions. We establish convergence guarantees in the stochastic convex setting, the proof of which builds upon matrix trace inequalities. Our experiments with state-of-the-art deep learning models show that Shampoo is capable of converging considerably faster than commonly used optimizers. Surprisingly, although it involves a more complex update rule, Shampoo’s runtime per step is comparable in practice to that of simple gradient methods such as SGD, AdaGrad, and Adam.}
}
@article{alpa,
  author     = {Lianmin Zheng and Zhuohan Li and Hao Zhang and Yonghao Zhuang and Zhifeng Chen and Yanping Huang and Yida Wang and Yuanzhong Xu and Danyang Zhuo and Joseph E. Gonzalez and Ion Stoica},
  title      = {Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning},
  journal    = {CoRR},
  volume     = {abs/2201.12023},
  year       = {2022},
  eprint     = {2201.12023},
  eprinttype = {arXiv}
}
@misc{jangda2022breaking,
  author        = {Jangda, Abhinav and Huang, Jun and Liu, Guodong and Nodehi Sabet, Amir Hossein and Maleki, Saeed and Miao, Youshan and Musuvathi, Madanlal and Mytkowicz, Todd and Sarikivi, Olli},
  title         = {Breaking the Computation and Communication Abstraction Barrier in Distributed Machine Learning Workloads},
  year          = {2022},
  eprint        = {2105.05720},
  archivePrefix = {arXiv},
  primaryClass  = {cs.DC}
}
@misc{afhq-dataset,
  author        = {Choi, Yunjey and Uh, Youngjung and Yoo, Jaejun and Ha, Jung-Woo},
  title         = {{StarGAN} v2: Diverse Image Synthesis for Multiple Domains},
  year          = {2020},
  eprint        = {1912.01865},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}
@article{object-detection-survey,
  author    = {Licheng Jiao and Fan Zhang and Fang Liu and Shuyuan Yang and Lingling Li and Zhixi Feng and Rong Qu},
  title     = {A Survey of Deep Learning-Based Object Detection},
  journal   = {{IEEE} Access},
  volume    = {7},
  pages     = {128837--128868},
  publisher = {Institute of Electrical and Electronics Engineers ({IEEE})},
  year      = 2019,
  doi       = {10.1109/access.2019.2939201},
  url       = {https://doi.org/10.1109/access.2019.2939201}
}
@ARTICLE{image-segmentation-survey,
  author  = {Minaee, Shervin and Boykov, Yuri and Porikli, Fatih and Plaza, Antonio and Kehtarnavaz, Nasser and Terzopoulos, Demetri},
  title   = {Image Segmentation Using Deep Learning: A Survey},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume  = {44},
  number  = {7},
  pages   = {3523--3542},
  year    = {2022},
  doi     = {10.1109/TPAMI.2021.3059968}
}
@misc{group-norm,
  author        = {Yuxin Wu and Kaiming He},
  title         = {Group Normalization},
  year          = {2018},
  archivePrefix = {arXiv},
  eprint        = {1803.08494},
  primaryClass  = {cs.CV}
}
@misc{improved-diffusion,
  author        = {Alex Nichol and Prafulla Dhariwal},
  title         = {Improved Denoising Diffusion Probabilistic Models},
  year          = {2021},
  archivePrefix = {arXiv},
  eprint        = {2102.09672},
  primaryClass  = {cs.LG}
}