<!doctype html>
<meta charset="utf-8">
<meta name="viewport" content="width=1080">
<script src="http://distill.pub/template.v1.js"></script>
<script type="text/front-matter">
title: Why Momentum Really Works
description: We often think of optimization with momentum as a ball rolling down a hill. This isn't wrong, but there is much more to the story.
authors:
- Gabriel Goh: http://gabgoh.github.io
affiliations:
- UC Davis: http://math.ucdavis.edu
</script>
<!-- Katex -->
<!--<script src="assets/lib/auto-render.min.js"></script>-->
<!--<script src="assets/lib/katex.min.js"></script>-->
<link rel="stylesheet" href="assets/lib/katex.min.css">
<link rel="stylesheet" type="text/css" href="assets/widgets.css">
<!-- Required -->
<script src="assets/lib/lib.js"></script>
<script src="assets/utils.js"></script>
<script>
var renderQueue = [];
function renderMath(elem) {
// renderMathInElement(
// elem,
// {
// delimiters: [
// {left: "$$", right: "$$", display: true},
// {left: "$", right: "$", display: false},
// ]
// }
// );
}
var deleteQueue = [];
function renderLoading(figure) {
var loadingScreen = figure.append("svg")
.style("width", figure.style("width"))
.style("height", figure.style("height"))
.style("position","absolute")
.style("top", "0px")
.style("left","0px")
.style("background","white")
.style("border", "0px dashed #DDD")
.style("opacity", 1)
return function(callback) { loadingScreen.remove() };
}
</script>
<div id="math-cache" style="display: none;">
<dt-math class="star">\star</dt-math>
<dt-math class="plus">+</dt-math>
<dt-math class="minus">-</dt-math>
<dt-math class="equals">=</dt-math>
<dt-math class="alpha">\alpha</dt-math>
<dt-math class="lambda">\lambda</dt-math>
<dt-math class="beta">\beta</dt-math>
<dt-math class="r">R</dt-math>
<dt-math class="alpha-equals">\alpha=</dt-math>
<dt-math class="beta-equals">\beta=</dt-math>
<dt-math class="beta-equals-zero">\beta = 0</dt-math>
<dt-math class="beta-equals-one">\beta=1</dt-math>
<dt-math class="alpha-equals-one-over-lambda-i">\alpha = 1/\lambda_i</dt-math>
<dt-math class="model">\text{model}</dt-math>
<dt-math class="p">0 p_1</dt-math>
<dt-math class="phat">0 \bar{p}_1</dt-math>
<dt-math class="two-sqrt-beta">2\sqrt{\beta}</dt-math>
<dt-math class="lambda-i">\lambda_i</dt-math>
<dt-math class="lambda-i-equals-zero">\lambda_i = 0</dt-math>
<dt-math class="alpha-gt-one-over-lambda-i">\alpha > 1/\lambda_i</dt-math>
<dt-math class="max-sigma-one">\max\{|\sigma_1|,|\sigma_2|\} > 1</dt-math>
<dt-math class="x-i-k">x_i^k - x_i^*</dt-math>
<dt-math class="xi-i">\xi_i</dt-math>
<dt-math class="beta-equals-one-minus">\beta = (1 - \sqrt{\alpha \lambda_i})^2</dt-math>
</div>
<script>
function MathCache(id) {
return document.querySelector("#math-cache ." + id).innerHTML;
}
</script>
<svg style="display: none;">
<g id="pointerThingy">
<circle fill="none" stroke="#FF6C00" stroke-linecap="round" cx="0" cy="0" r="14"/>
<circle fill="#FF6C00" cx="0" cy="0" r="11"/>
<path id="XMLID_173_" fill="#FFFFFF" d="M-3.2-1.3c0-0.1,0-0.2,0-0.3c0-0.1,0-0.2,0-0.3c-0.6,0-1.2,0-1.8,0c0,0.6,0,1.2,0,1.8
c0.2,0,0.4,0,0.6,0c0-0.4,0-0.8,0-1.2c0,0,0.1,0,0.1,0c0.3,0,0.5,0,0.8,0C-3.4-1.3-3.3-1.3-3.2-1.3c0,0.2,0,0.4,0,0.6
c0.2,0,0.4,0,0.6,0c0,0.2,0,0.4,0,0.6c0.2,0,0.4,0,0.6,0c0,0,0,0,0-0.1c0-1.6,0-3.2,0-4.8c0-0.6,0-1.2,0-1.8c0,0,0,0,0.1,0
c0.3,0,0.7,0,1,0c0.1,0,0.1,0,0.2,0c0-0.2,0-0.4,0-0.6c-0.4,0-0.8,0-1.2,0C-2-7.2-2-7-2-6.8c0,0,0,0-0.1,0c-0.2,0-0.3,0-0.5,0
c0,0,0,0-0.1,0c0,1.8,0,3.6,0,5.5c-0.2,0-0.3,0-0.4,0C-3.1-1.3-3.2-1.3-3.2-1.3z M1.1-3.7C1-3.8,1-3.8,1.1-3.7C1-4,1-4.1,1-4.3
c0,0,0,0,0-0.1c-0.4,0-0.8,0-1.2,0c0-0.8,0-1.6,0-2.4c-0.2,0-0.4,0-0.6,0c0,1.8,0,3.6,0,5.5c0.2,0,0.4,0,0.6,0c0-0.8,0-1.6,0-2.4
c0,0,0.1,0,0.1,0C0.3-3.7,0.6-3.7,1.1-3.7C1-3.7,1-3.7,1.1-3.7C1.1-3.7,1-3.7,1.1-3.7c0,0.8,0,1.6,0,2.3c0,0,0,0.1,0,0.1
c0.2,0,0.4,0,0.6,0c0-0.6,0-1.2,0-1.8c0.4,0,0.8,0,1.2,0c0,0.8,0,1.6,0,2.4c0.2,0,0.4,0,0.6,0c0-0.6,0-1.2,0-1.8c0.2,0,0.4,0,0.6,0
c0,0,0,0,0,0.1c0,0.1,0,0.3,0,0.4c0,0,0,0.1,0,0.1c0.2,0,0.4,0,0.5,0c0,0,0.1,0,0.1,0.1c0,0.2,0,0.5,0,0.7c0,1.1,0,2.3,0,3.4
c0,0,0,0,0,0.1c-0.2,0-0.4,0-0.6,0c0,0,0,0,0,0c0,0.6,0,1.1,0,1.7c0,0,0,0,0,0.1c-0.2,0-0.4,0-0.6,0c0,0.4,0,0.8,0,1.2
c-1.6,0-3.2,0-4.9,0c0-0.4,0-0.8,0-1.2c-0.2,0-0.4,0-0.6,0C-2,3.8-2,3.4-2,3c-0.2,0-0.4,0-0.6,0c0,0.4,0,0.8,0,1.2
c0.2,0,0.4,0,0.6,0C-2,4.8-2,5.4-2,6c2,0,4.1,0,6.1,0c0-0.1,0-0.2,0-0.3c0-0.5,0-0.9,0-1.4c0-0.1,0-0.1,0-0.2c0.2,0,0.4,0,0.5,0
c0.1,0,0.1,0,0.1-0.1c0-0.4,0-0.9,0-1.3c0-0.1,0-0.3,0-0.4c0.1,0,0.2,0,0.3,0c0.1,0,0.2,0,0.3,0c0-1.4,0-2.8,0-4.3
c-0.2,0-0.4,0-0.6,0c0-0.2,0-0.4,0-0.6c-0.2,0-0.4,0-0.6,0c0-0.2,0-0.4,0-0.6c-0.4,0-0.8,0-1.2,0c0-0.2,0-0.4,0-0.6
c-0.1,0-0.2,0-0.3,0c-0.4,0-0.9,0-1.3,0C1.2-3.7,1.1-3.7,1.1-3.7z M-3.2,1.8c0,0.4,0,0.8,0,1.2c0.2,0,0.4,0,0.5,0
c0.1,0,0.1,0,0.1-0.1c0-0.3,0-0.6,0-1c0-0.1,0-0.1,0-0.2C-2.8,1.8-3,1.8-3.2,1.8c0-0.4,0-0.8,0-1.2c-0.2,0-0.4,0-0.6,0
c0-0.2,0-0.4,0-0.6c-0.2,0-0.4,0-0.6,0c0,0.2,0,0.4,0,0.6c0.2,0,0.4,0,0.6,0c0,0,0,0,0,0.1c0,0.1,0,0.3,0,0.4c0,0.2,0,0.5,0,0.7
c0,0,0,0.1,0.1,0.1c0.1,0,0.2,0,0.3,0C-3.4,1.8-3.3,1.8-3.2,1.8z"/>
<path id="XMLID_172_" fill="#FFFFFF" d="M4.1,4.2C4.1,4.2,4.1,4.2,4.1,4.2c0-0.6,0-1.2,0-1.8c0,0,0,0,0,0c0.2,0,0.4,0,0.6,0
c0,0,0-0.1,0-0.1c0-1.1,0-2.3,0-3.4c0-0.2,0-0.5,0-0.7c0,0,0-0.1-0.1-0.1c-0.2,0-0.4,0-0.5,0c0,0,0-0.1,0-0.1c0-0.1,0-0.3,0-0.4
c0,0,0-0.1,0-0.1c-0.2,0-0.4,0-0.6,0c0,0.6,0,1.2,0,1.8c-0.2,0-0.4,0-0.6,0c0-0.8,0-1.6,0-2.4c-0.4,0-0.8,0-1.2,0
c0,0.6,0,1.2,0,1.8c-0.2,0-0.4,0-0.6,0c0,0,0-0.1,0-0.1c0-0.7,0-1.5,0-2.2c0,0,0-0.1,0-0.1l0,0c0.1,0,0.2,0,0.2,0
c0.4,0,0.9,0,1.3,0c0.1,0,0.2,0,0.3,0c0,0.2,0,0.4,0,0.6c0.4,0,0.8,0,1.2,0c0,0.2,0,0.4,0,0.6c0.2,0,0.4,0,0.6,0c0,0.2,0,0.4,0,0.6
c0.2,0,0.4,0,0.6,0c0,1.4,0,2.8,0,4.3c-0.1,0-0.2,0-0.3,0c-0.1,0-0.2,0-0.3,0c0,0.1,0,0.3,0,0.4c0,0.4,0,0.9,0,1.3
c0,0.1,0,0.1-0.1,0.1C4.5,4.2,4.3,4.2,4.1,4.2L4.1,4.2z"/>
<path id="XMLID_171_" fill="#FFFFFF" d="M4.1,4.2c0,0.1,0,0.1,0,0.2c0,0.5,0,0.9,0,1.4c0,0.1,0,0.2,0,0.3C2.1,6,0,6-2,6
c0-0.6,0-1.2,0-1.8c-0.2,0-0.4,0-0.6,0c0-0.4,0-0.8,0-1.2C-2.4,3-2.2,3-2,3c0,0.4,0,0.8,0,1.2c0.2,0,0.4,0,0.6,0c0,0.4,0,0.8,0,1.2
c1.6,0,3.2,0,4.9,0c0-0.4,0-0.8,0-1.2C3.7,4.2,3.9,4.2,4.1,4.2L4.1,4.2z"/>
<path id="XMLID_170_" fill="#FFFFFF" d="M-2-6.8c0,0.6,0,1.2,0,1.8c0,1.6,0,3.2,0,4.8c0,0,0,0,0,0.1c-0.2,0-0.4,0-0.6,0
c0-0.2,0-0.4,0-0.6c-0.2,0-0.4,0-0.6,0c0-0.2,0-0.4,0-0.6l0,0c0.1,0,0.1,0,0.2,0c0.1,0,0.3,0,0.4,0c0-1.8,0-3.6,0-5.5
c0,0,0.1,0,0.1,0C-2.4-6.8-2.2-6.8-2-6.8C-2.1-6.8-2-6.8-2-6.8L-2-6.8z"/>
<path id="XMLID_169_" fill="#FFFFFF" d="M1.1-3.7C1-3.7,1-3.7,1.1-3.7c-0.4,0-0.8,0-1.2,0c0,0,0,0-0.1,0c0,0.8,0,1.6,0,2.4
c-0.2,0-0.4,0-0.6,0c0-1.8,0-3.6,0-5.5c0.2,0,0.4,0,0.6,0c0,0.8,0,1.6,0,2.4c0.4,0,0.8,0,1.2,0c0,0,0,0.1,0,0.1C1-4.1,1-4,1.1-3.7
C1-3.8,1-3.8,1.1-3.7L1.1-3.7z"/>
<path id="XMLID_168_" fill="#FFFFFF" d="M-3.2,1.8c-0.1,0-0.2,0-0.3,0c-0.1,0-0.2,0-0.3,0c0,0-0.1,0-0.1-0.1c0-0.2,0-0.5,0-0.7
c0-0.1,0-0.3,0-0.4c0,0,0,0,0-0.1c-0.2,0-0.4,0-0.6,0c0-0.2,0-0.4,0-0.6c0.2,0,0.4,0,0.6,0c0,0.2,0,0.4,0,0.6c0.2,0,0.4,0,0.6,0
C-3.2,0.9-3.2,1.3-3.2,1.8c0.2,0,0.4,0,0.6,0c0,0.1,0,0.1,0,0.2c0,0.3,0,0.6,0,1C-2.6,3-2.7,3-2.7,3c-0.2,0-0.3,0-0.5,0
C-3.2,2.6-3.2,2.2-3.2,1.8z"/>
<path id="XMLID_167_" fill="#FFFFFF" d="M-3.2-1.3c-0.1,0-0.2,0-0.3,0c-0.3,0-0.5,0-0.8,0c0,0,0,0-0.1,0c0,0.4,0,0.8,0,1.2
c-0.2,0-0.4,0-0.6,0c0-0.6,0-1.2,0-1.8c0.6,0,1.2,0,1.8,0c0,0.1,0,0.2,0,0.3C-3.2-1.5-3.2-1.4-3.2-1.3L-3.2-1.3z"/>
<path id="XMLID_166_" fill="#FFFFFF" d="M-2-6.8C-2-7-2-7.2-2-7.4c0.4,0,0.8,0,1.2,0c0,0.2,0,0.4,0,0.6c-0.1,0-0.1,0-0.2,0
C-1.3-6.8-1.6-6.8-2-6.8C-2-6.8-2-6.8-2-6.8L-2-6.8z"/>
</g>
</svg>
<dt-article class="centered">
<h1>Why Momentum Really Works</h1>
<figure style = "position:relative; width:984px; height:400px;">
<div id="banana" style="position:relative; border: 1px solid rgba(0, 0, 0, 0.2);"></div>
<div id="sliderAlpha" style="position:absolute; width:300px; height: 50px; left:20px; top: 320px;">
<text class="figtext" style="top: -5px; left: 20px; position: relative;">Step-size α = 0.02</text>
</div>
<div id="sliderBeta" style="position:absolute; width: 300px; height: 50px; left: 280px; top: 320px;;">
<text class="figtext" style="top: -5px; left: 20px; position: relative;">Momentum β = 0.99</text>
</div>
<figcaption id="Bananacaption" style="position:absolute; width: 420px; height: 90px; left: 540px; top: 320px;">
We often think of Momentum as a means of dampening oscillations and speeding up the iterations, leading to faster convergence. But it has other interesting behavior. It allows a larger range of step-sizes to be used, and creates its own oscillations. What is going on?
</figcaption>
</figure>
<dt-byline class="l-page"></dt-byline>
<script src="assets/lib/contour_plot.js"></script>
<script src="assets/iterates.js"></script>
<script>
// Render Foreground
var iterControl = genIterDiagram(bananaf, [1,1/3], [[-2,2],[2/3 + 0.4,-2/3 + 0.4]])
.alpha(0.003)
.beta(0)
(d3.select("#banana").style("position","relative"))
var iterChange = iterControl.control
var getw0 = iterControl.w0
var StepRange = d3.scaleLinear().domain([0,100]).range([0,0.0062])
var MomentumRange = d3.scaleLinear().domain([0,100]).range([0,0.98])
var update = function (i,j) { iterChange(i, 0, getw0()) }
var slidera = sliderGen([230, 40])
.ticks([0,0.003,0.006])
.ticktitles( function(d,i) { return ["0", "0.003", "0.006"][i]})
.change( function (i) {
d3.select("#sliderAlpha").selectAll(".figtext").html("Step-size α = " + getalpha().toPrecision(2) )
iterChange(getalpha(), getbeta(), getw0() )
} )
.startxval(0.003)
.cRadius(7)
.shifty(-12)
.margins(20,20)
var sliderb = sliderGen([230, 40])
.ticks([0,0.5,0.99])
.change( function (i) {
d3.select("#sliderBeta").selectAll(".figtext").html("Momentum β = " + getbeta().toPrecision(2) )
iterChange(getalpha(), getbeta(), getw0() )
} )
.cRadius(7)
.shifty(-12)
.startxval(0.74)
.margins(20,20)
var getalpha = slidera( d3.select("#sliderAlpha")).xval
var getbeta = sliderb( d3.select("#sliderBeta")).xval
iterChange(getalpha(), getbeta(), getw0() )
</script>
<!-- The Momentum algorithm <dt-cite key="sutskever2013importance,polyak1964some,rutishauser1959theory"></dt-cite>, also known as the heavy ball method, can sometimes seem inseparable from the physics which has inspired it. It is common to think of gradient descent as a man walking down a hill, and momentum is a heavy ball rolling down it. Momentum grants iterates inertial energy, dampening oscillations between steep valleys, reinforcing directions which vary smoothly, allowing it to plough through narrow valleys, small humps, ridges and local minima.
The Momentum algorithm has a very natural physical interpretation. Where gradient descent walks down a hill along the steepest path, momentum is a heavy ball rolling down, ploughing through narrow valleys, small humps, ridges and local minima.
</p>
<p>
This story, now a staple of any exposition of the subject, is unfortunately a cartoon. And though it isn't wrong, it is an oversimplification, failing to account for many of momentum's important properties. It does not explain, for example, how much momentum should be used. Momentum works by overshooting the target, and relies on the gradient's corrective forces to set it back on track. But are these wasted iterations really worth it? And if so, how much of a speedup can one expect? Momentum also has the effect of increasing the range of permissible step-sizes. But what are the limits of this expansion, and why does it happen?
</p> -->
<p>
Here’s a popular story about momentum <dt-cite key="sutskever2013importance,polyak1964some,rutishauser1959theory"></dt-cite>: gradient descent is a man walking down a hill. He follows the steepest path downwards; his progress is slow, but steady. Momentum is a heavy ball rolling down the same hill. The added inertia acts both as a smoother and an accelerator, dampening oscillations and causing us to barrel through narrow valleys, small humps and local minima.
</p>
<p>
This standard story isn’t wrong, but it fails to explain many important behaviors of momentum. In fact, momentum can be understood far more precisely if we study it on the right model.
</p>
<p>
One nice model is the convex quadratic. This model is rich enough to reproduce momentum’s local dynamics in real problems, and yet simple enough to be understood in closed form. This balance gives us powerful traction for understanding this algorithm.
</p>
<hr>
<p>
We begin with gradient descent. The algorithm has many virtues, but speed is not one of them. It is simple -- when optimizing a smooth function <dt-math>f</dt-math>, we take a small step in the direction of the negative gradient
<dt-math block>w^{k+1} = w^k-\alpha\nabla f(w^k).</dt-math>
For a step-size small enough, gradient descent makes a monotonic improvement at every iteration. It always converges, albeit to a local minimum. And under a few weak curvature conditions it can even get there at an exponential rate.
</p>
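<p>
In code, the update is a one-line loop. Here is a minimal NumPy sketch; grad_f stands in for any routine returning <dt-math>\nabla f(w)</dt-math>, and the little quadratic bowl at the end is purely illustrative.
</p>
<dt-code block language="python">
import numpy as np

def gradient_descent(grad_f, w0, alpha, iters):
    """Gradient descent: w^{k+1} = w^k - alpha * grad f(w^k)."""
    w = np.array(w0, dtype=float)
    for _ in range(iters):
        w = w - alpha * grad_f(w)
    return w

# Illustration on f(w) = (w_1 - 1)^2 + 10 (w_2 + 2)^2, whose minimum is (1, -2).
grad_f = lambda w: np.array([2 * (w[0] - 1), 20 * (w[1] + 2)])
w = gradient_descent(grad_f, w0=[0.0, 0.0], alpha=0.05, iters=200)
print(w)  # close to [1, -2]
</dt-code>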
<p>
But this exponential rate of decrease, though appealing in theory, can often be infuriatingly slow. Things often begin quite well -- with an impressive, almost immediate decrease in the loss. But as the iterations progress, things start to slow down. You start to get a nagging feeling you're not making as much progress as you should be. What has gone wrong?
</p>
<p>
The problem could be the optimizer's old nemesis, pathological curvature. Pathological curvature is, simply put, regions of <dt-math>f</dt-math> which aren't scaled properly. The landscapes are often described as valleys, trenches, canals and ravines. The iterates either jump between valleys, or approach the optimum in small, timid steps. Progress along certain directions grinds to a halt. In these unfortunate regions, gradient descent fumbles.</p>
<p>
Momentum proposes the following tweak to gradient descent. We give gradient descent a short-term memory:
<dt-math block>
\begin{aligned}
z^{k+1}&=\beta z^{k}+\nabla f(w^{k})\\[0.4em]
w^{k+1}&=w^{k}-\alpha z^{k+1}
\end{aligned}
</dt-math>
The change is innocent, and costs almost nothing. When <dt-math>\beta = 0</dt-math>, we recover gradient descent. But for <dt-math>\beta = 0.99</dt-math> (sometimes <dt-math>0.999</dt-math>, if things are really bad), this appears to be the boost we need. Our iterations regain the speed and boldness they lost, speeding to the optimum with a renewed energy.
</p>
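<p>
The two-line change looks like this in the same NumPy sketch (grad_f plays the same role as before).
</p>
<dt-code block language="python">
import numpy as np

def momentum(grad_f, w0, alpha, beta, iters):
    """Heavy-ball momentum:
       z^{k+1} = beta * z^k + grad f(w^k)
       w^{k+1} = w^k - alpha * z^{k+1}
    """
    w = np.array(w0, dtype=float)
    z = np.zeros_like(w)
    for _ in range(iters):
        z = beta * z + grad_f(w)
        w = w - alpha * z
    return w

# beta = 0 recovers plain gradient descent; beta = 0.99 is the aggressive
# setting mentioned in the text.
</dt-code>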
<p>
Optimizers call this minor miracle "acceleration".
</p>
<p>
The new algorithm may seem at first glance like a cheap hack. A simple trick to get around gradient descent's more aberrant behavior -- a smoother for oscillations between steep canyons. But the truth, if anything, is the other way round. It is gradient descent which is the hack. First, momentum gives up to a quadratic speedup on many functions. <dt-fn> It is possible, however, to construct very specific counterexamples where momentum does not converge, even on convex functions. See <dt-cite key="lessard2016analysis"></dt-cite> for a counterexample. </dt-fn> This is no small matter -- this is similar to the speedup you get from the Fast Fourier Transform, Quicksort, and Grover's Algorithm. When the universe gives you quadratic speedups, you should start to pay attention.
</p>
<p>
But there's more. A lower bound, courtesy of Nesterov <dt-cite key="nesterov2013introductory"></dt-cite>, states that momentum is, in a certain very narrow and technical sense, optimal. Now, this doesn't mean it is the best algorithm for all functions in all circumstances. But it does satisfy some curiously beautiful mathematical properties which scratch a very human itch for perfection and closure. But more on that later. Let's say this for now -- momentum is an algorithm for the book.
</p>
<hr>
<h2>First Steps: Gradient Descent</h2>
<p>
We begin by studying gradient descent on the simplest model possible which isn't trivial -- the convex quadratic,
<dt-math block>
f(w) = \tfrac{1}{2}w^TAw - b^Tw, \qquad w \in \mathbf{R}^n.
</dt-math>
Assume <dt-math>A</dt-math> is symmetric and invertible; then the optimal solution <dt-math>w^{\star}</dt-math> occurs at
<dt-math block> w^{\star} = A^{-1}b.</dt-math>
Simple as this model may be, it is rich enough to approximate many functions (think of <dt-math>A</dt-math> as your favorite model of curvature -- the Hessian, Fisher Information Matrix <dt-cite key="amari1998natural"></dt-cite>, etc) and captures all the key features of pathological curvature. And more importantly, we can write an exact closed formula for gradient descent on this function.
</p>
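<p>
As a quick illustrative sketch in NumPy, with a small positive definite <dt-math>A</dt-math> chosen arbitrarily, the minimizer is a single linear solve:
</p>
<dt-code block language="python">
import numpy as np

# f(w) = 0.5 w^T A w - b^T w with a symmetric positive definite A (illustrative).
A = np.array([[2.0, 1.0],
              [1.0, 3.0]])
b = np.array([1.0, 2.0])

w_star = np.linalg.solve(A, b)   # the optimum w* = A^{-1} b
f = lambda w: 0.5 * w @ A @ w - b @ w
print(w_star, f(w_star))
</dt-code>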
<p>
This is how it goes. Since <dt-math>\nabla f(w)=Aw - b</dt-math>, the iterates are
<dt-math block>
w^{k+1}=w^{k}- \alpha (Aw^{k} - b).
</dt-math>
Here's the trick. There is a very natural space to view gradient descent where all the dimensions act independently -- the eigenvectors of <dt-math>A</dt-math>.
</p>
<figure style = "width:750px; height:340px; display:block; margin-left:auto; margin-right:auto; position:relative" id = "change_of_variables">
<div id = "mom1" style="width:400px; position:absolute; left:0px; top:0px"></div>
<div id = "mom2" style="width:400px; position:absolute; left:400px; top:0px"></div>
</figure>
<script>
deleteQueue.push(renderLoading(d3.select("#change_of_variables")))
renderQueue.push(function(callback) {
var U = givens(Math.PI/4)
var Ut = numeric.transpose(U)
// Render Foreground
var left = d3.select("#mom1").style("border", "1px solid rgba(0, 0, 0, 0.2)")
var c1 = genIterDiagram(quadf, [0,0], [[-3,3],[-3,3]])
.width(340)
.height(340)
.iters(300)
.alpha(0.018)
.showSolution(false)
.pathWidth(1)
.circleRadius(1.5)
.pointerScale(0.8)
.showStartingPoint(false)
.drag(function() {
c2.control(c1.alpha(),
c1.beta(),
numeric.dot(U,c1.w0())) })
(left)
var right = d3.select("#mom2").style("border", "1px solid rgba(0, 0, 0, 0.2)")
var c2 = genIterDiagram(eyef, [0,0], [[-3,3],[-3,3]])
.width(340)
.height(340)
.iters(300)
.alpha(0.018)
.showSolution(false)
.pathWidth(1)
.circleRadius(1.5)
.pointerScale(0.8)
.showStartingPoint(false)
.drag(function() {
c1.control(c2.alpha(),
c2.beta(),
numeric.dot(Ut,c2.w0())) })
(right)
// Initialize
c2.control(0.018,0,[-2.5,1])
c1.control(0.018,0,numeric.dot(Ut,[-2.5,1]));
callback(null);
});
</script>
<p>
Every symmetric matrix <dt-math>A</dt-math> has an eigenvalue decomposition
<dt-math block>
A=Q\ \text{diag}(\lambda_{1},\ldots,\lambda_{n})\ Q^{T},\qquad Q = [q_1,\ldots,q_n],
</dt-math>
and, as per convention, we will assume that the <dt-math>\lambda_i</dt-math>'s are sorted, from smallest <dt-math>\lambda_1</dt-math> to biggest <dt-math>\lambda_n</dt-math>. If we perform a change of basis, <dt-math>x^{k} = Q^T(w^{k} - w^\star)</dt-math>, the iterations break apart, becoming:
<dt-math block>
\begin{aligned}
x_{i}^{k+1} & =x_{i}^{k}-\alpha \lambda_ix_{i}^{k} \\[0.4em]
&= (1-\alpha\lambda_i)x^k_i=(1-\alpha \lambda_i)^{k+1}x^0_i
\end{aligned}
</dt-math>
Moving back to our original space <dt-math>w</dt-math>, we can see that
<dt-math block>
w^k - w^\star = Qx^k=\sum_{i=1}^n x^0_i(1-\alpha\lambda_i)^k q_i
</dt-math>
and there we have it -- gradient descent in closed form.
</p>
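<p>
The closed form is easy to check numerically. A small NumPy sketch, on an arbitrary two dimensional quadratic:
</p>
<dt-code block language="python">
import numpy as np

A = np.array([[1.0, 0.4],
              [0.4, 0.25]])            # symmetric positive definite (illustrative)
b = np.array([1.0, 0.0])
w_star = np.linalg.solve(A, b)

lam, Q = np.linalg.eigh(A)             # A = Q diag(lam) Q^T, lam in ascending order
alpha, iters = 1.0, 50                 # alpha * lam_i stays below 2 here
w0 = np.array([2.0, -1.0])
x0 = Q.T @ (w0 - w_star)               # error components in the eigenbasis

w = w0.copy()
for _ in range(iters):
    w = w - alpha * (A @ w - b)        # gradient descent

# Closed form: w^k - w* = sum_i x_i^0 (1 - alpha lam_i)^k q_i
w_closed = w_star + Q @ (x0 * (1 - alpha * lam) ** iters)
print(np.allclose(w, w_closed))        # True
</dt-code>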
<h3>Decomposing the Error</h3>
<p>
The above equation admits a simple interpretation. Each element of <dt-math>x^0</dt-math> is the component of the error in the initial guess in the <dt-math>Q</dt-math>-basis. There are <dt-math>n</dt-math> such errors, and each of these errors follows its own, solitary path to the minimum, decreasing exponentially with a compounding rate of <dt-math>1-\alpha\lambda_i</dt-math>. The closer that number is to <dt-math>1</dt-math>, the slower it converges.
</p>
<p>
For most step-sizes, the eigenvectors with largest eigenvalues converge the fastest. This triggers an explosion of progress in the first few iterations, before things slow down as the smaller eigenvectors' struggles are revealed. By writing the loss in terms of the error in each eigenspace,
<dt-math block>
f(w^{k})-f(w^{\star})=\tfrac{1}{2}\sum_i(1-\alpha\lambda_{i})^{2k}\lambda_{i}[x_{i}^{0}]^2
</dt-math>
we can visualize the contribution of each error component to the loss.
</p>
<figure style="position:relative; width:920px; height:360px" id = "milestones_gd">
<figcaption style="position:absolute; text-align:left; left:135px; width:350px; height:80px">Optimization can be seen as combination of several component problems, shown here as <svg style="position:relative; top:2px; width:3px; height:14px; background:#fde0dd"></svg> 1 <svg style="position:relative; top:2px; width:3px; height:14px; background:#fa9fb5"></svg> 2 <svg style="position:relative; top:2px; width:3px; height:14px; background:#c51b8a"></svg> 3 with eigenvalues <svg style="position:relative; top:2px; width:3px; height:14px; background:#fde0dd"></svg> <dt-math>\lambda_1=0.01</dt-math>, <svg style="position:relative; top:2px; width:3px; height:14px; background:#fa9fb5"></svg> <dt-math>\lambda_2=0.1</dt-math>, and <svg style="position:relative; top:2px; width:3px; height:14px; background:#c51b8a"></svg> <dt-math>\lambda_3=1</dt-math> respectively. </figcaption>
<!-- ["#fde0dd", "#fa9fb5", "#c51b8a"]
-->
<div id = "sliderStep" style="position:absolute; left:550px; width:250px; height:100px">
<div id="stepSizeMilestones"
class="figtext"
style="position:absolute; left:15px; top:15px">
Step-size
</div>
<div class="figtext2" id="milestones_gd_optstep"
style="position:absolute; font-size:11px; left:152px; top:18px; z-index:10; cursor: pointer">
Optimal Step-size
</div>
<svg style="position:absolute; font-size:10px; left:224px; top:34px">
<line marker-end="url(#arrowhead)" style="stroke: black; stroke-width: 1.5; visibility: visible;" x2="5" y2="10" x1="5" y1="0"></line>
</svg>
</div>
<div id = "obj"></div>
</figure>
<script src="assets/milestones.js"></script>
<script>
deleteQueue.push(renderLoading(d3.select("#milestones_gd")))
renderQueue.push(function(callback) {
var graphDiv = d3.select("#obj")
.style("width", 920 + "px")
.style("height", 300 + "px")
.style("top", "90px")
.style("position", "relative")
.style("margin-left", "auto")
.style("margin-right", "auto")
.attr("width", 920)
.attr("height", 500)
var svg = graphDiv.append("svg")
.attr("width", 920)
.attr("height", 300)
.style("position","absolute")
.style("left", "15px")
var updateSliderGD = renderMilestones(svg, function() {});
var alphaHTML = MathCache("alpha-equals");
var slidera = sliderGen([250, 80])
.ticks([0,1,200/(101),2])
.change( function (i) {
var html = alphaHTML + '<span style="font-weight: normal;">' + i.toPrecision(4) + "</span>";
d3.select("#stepSizeMilestones")
.html("Stepsize " + html )
updateSliderGD(i,0.000)
} )
.ticktitles(function(d,i) { return [0,1,"",2][i] })
.startxval(200/(101))
.cRadius(7)
.shifty(-12)
.shifty(10)
.margins(20,20)(d3.select("#sliderStep"))
// renderDraggable(svg, [133.5, 23], [114.5, 90], 2, " ").attr("opacity", 0.1)
// renderDraggable(svg, [133.5, 88], [115.5, 95], 2, " ").attr("opacity", 0.1)
// renderDraggable(svg, [132.5, 154], [114.5, 100], 2, " ").attr("opacity", 0.1)
d3.select("#milestones_gd_optstep").on("click", slidera.init)
svg.append("text")
.attr("class", "katex morsd mathit")
.style("font-size", "19px")
.style("font-family","KaTeX_Math")
.attr("x", 105)
.attr("y", 50)
.attr("text-anchor", "end")
.attr("fill", "gray")
.html("f(w<tspan baseline-shift = \"super\" font-size = \"15\">k</tspan>) - f(w<tspan baseline-shift = \"super\" font-size = \"15\">*</tspan>)")
svg.append("text")
.style("font-size", "13px")
.attr("x", 0)
.attr("y", 80)
.attr("dy", 0)
.attr("transform", "translate(110,0)")
.attr("class", "caption")
.attr("text-anchor", "end")
.attr("fill", "gray")
.text("At the initial point, the error in each component is equal.")
svg.selectAll(".caption").call(wrap, 100)
svg.append("text")
.style("font-size", "13px")
.attr("x", 420)
.attr("y", 270)
.attr("dy", 0)
.attr("dx", -295)
.attr("text-anchor", "start")
.attr("fill", "gray")
.text("At the optimum, the rates of convergence of the largest and smallest eigenvalues equalize.")
callback(null);
});
</script>
<h3>Choosing A Step-size</h3>
<p>
The above analysis gives us immediate guidance as to how to set a step-size <dt-math>\alpha</dt-math>. In order to converge, each <dt-math>|1-\alpha \lambda_i|</dt-math> must be strictly less than 1. All workable step-sizes, therefore, fall in the interval
<dt-math block>0<\alpha\lambda_i<2.</dt-math>
The overall convergence rate is determined by the slowest error component, which must be either <dt-math>\lambda_1</dt-math> or <dt-math>\lambda_n</dt-math>:
<dt-math block>
\begin{aligned}\text{rate}(\alpha) & ~=~ \max_{i}\left|1-\alpha\lambda_{i}\right|\\[0.9em] & ~=~ \max\left\{|1-\alpha\lambda_{1}|,~ |1-\alpha\lambda_{n}|\right\} \end{aligned}
</dt-math>
</p>
<p>
This overall rate is minimized when the rates for <dt-math>\lambda_1</dt-math> and <dt-math>\lambda_n</dt-math> are the same -- this mirrors our informal observation in the previous section that the optimal step-size causes the first and last eigenvectors to converge at the same rate. If we work this through we get:
<dt-math block>
\begin{aligned}
\text{optimal }\alpha ~=~{\mathop{\text{argmin}}\limits_\alpha} ~\text{rate}(\alpha) & ~=~\frac{2}{\lambda_{1}+\lambda_{n}}\\[1.4em]
\text{optimal rate} ~=~{\min_\alpha} ~\text{rate}(\alpha) & ~=~\frac{\lambda_{n}/\lambda_{1}-1}{\lambda_{n}/\lambda_{1}+1}
\end{aligned}
</dt-math>
</p>
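<p>
A small NumPy sketch, on an illustrative spectrum, confirms the formula:
</p>
<dt-code block language="python">
import numpy as np

lam = np.array([0.01, 0.1, 1.0])                # eigenvalues, lambda_1 smallest
rate = lambda alpha: np.max(np.abs(1 - alpha * lam))

alpha_opt = 2 / (lam[0] + lam[-1])              # 2 / (lambda_1 + lambda_n)
kappa = lam[-1] / lam[0]
rate_opt = (kappa - 1) / (kappa + 1)            # (lambda_n/lambda_1 - 1) / (lambda_n/lambda_1 + 1)

print(rate(alpha_opt), rate_opt)                # both are about 0.9802 here

# No step-size on a fine grid of workable values does better than the formula:
grid = np.linspace(0.0, 2.0 / lam[-1], 5001)
print(min(rate(a) for a in grid) >= rate(alpha_opt))   # True
</dt-code>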
<p>
Notice the ratio <dt-math>\lambda_n/\lambda_1</dt-math> determines the convergence rate of the problem. In fact, this ratio appears often enough that we give it a name, and a symbol -- the condition number.
<dt-math block>
\text{condition number} := \kappa :=\frac{\lambda_n}{\lambda_1}
</dt-math>
The condition number means many things. It is a measure of how close to singular a matrix is. It is a measure of how robust <dt-math>A^{-1}b</dt-math> is to perturbations in <dt-math>b</dt-math>. And, in this context, the condition number gives us a measure of how poorly gradient descent will perform. A ratio of <dt-math>\kappa = 1</dt-math> is ideal, giving convergence in one step (of course, the function is trivial). Unfortunately the larger the ratio, the slower gradient descent will be. The condition number is therefore a direct measure of pathological curvature.
</p>
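<p>
For a symmetric positive definite matrix, NumPy's built-in condition number is this same ratio (reusing the small example matrix from the earlier sketch):
</p>
<dt-code block language="python">
import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 3.0]])
lam = np.linalg.eigvalsh(A)                  # eigenvalues in ascending order
print(lam[-1] / lam[0], np.linalg.cond(A))   # the same number, kappa
</dt-code>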
<hr>
<h2>Example: Polynomial Regression</h2>
<p>
The above analysis reveals an insight: all errors are not made equal. Indeed, there are different kinds of errors, <dt-math>n</dt-math> to be exact, one for each of the eigenvectors of <dt-math>A</dt-math>. And gradient descent is better at correcting some kinds of errors than others. But what do the eigenvectors of <dt-math>A</dt-math> mean? Surprisingly, in many applications they admit a very concrete interpretation.
</p>
<p>
Let's see how this plays out in polynomial regression. Given 1D data, <dt-math>\xi_i</dt-math>, our problem is to fit the model
<dt-math block>
\text{model}(\xi)=w_{1}p_{1}(\xi)+\cdots+w_{n}p_{n}(\xi)\qquad p_{i}=\xi\mapsto\xi^{i-1}
</dt-math>
to our observations, <dt-math>d_i</dt-math>. This model, though nonlinear in the input <dt-math>\xi</dt-math>, is linear in the weights, and therefore we can write the model as a linear combination of monomials, like:
</p>
<figure id = "poly0f" style="width:940px; height:200px">
<div id = "poly0" style="width:940px; height:185px; position:absolute; top:20px"></div>
</figure>
<script src="assets/eigensum.js"></script>
<script>
deleteQueue.push(renderLoading(d3.select("#poly0")))
renderQueue.push(function(callback) {
// Preprocess x, get eigendecomposition, etc
var x = [-0.6, -0.55,-0.5,-0.45,-0.4,0.4,0.45,0.5,0.55,0.6]
var D = vandermonde(x, 5)
var Eigs = eigSym(numeric.dot(numeric.transpose(D),D))
var U = Eigs.U
var lambda = Eigs.lambda
// Preprocess y
var b = [-3/2,-4/2,-5/2,-3/2,-2/2,1/2,2/2,3/2,2/2,1/2]
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var step = 1.8/lambda[0]
var iter = geniter(U, lambda, Dtb, step)
var eigensum = d3.select("#poly0")
var wi = [-2,-2,2,2,2,-2]
function refit(b) {
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var Utsol = numeric.dot(sol,U)
eigenControl.updateweights(Utsol)
}
var eigenControl = renderEigenPanel(eigensum, numeric.identity(6), x, b, wi, refit);
// Swoopy Annotator
var annotations = [
{
"x": 0,
"y": 0,
"path": "M 60,5 A 19.018 19.018 0 0 0 36,27",
"text": "scrub values",
"textOffset": [
64,
9
]
}
]
drawAnnotations(d3.select("#poly0f"), annotations)
callback(null);
});
</script>
<p>
Because of the linearity, we can fit this model to our data <dt-math>\xi_i</dt-math> using linear regression on the model mismatch
<dt-math block>
\text{minimize}_w \qquad\tfrac{1}{2}\sum_i (\text{model}(\xi_{i})-d_{i})^{2} ~~=~~ \tfrac{1}{2}\|Zw - d\|^2
</dt-math>
where
<dt-math block>
Z=\left(\begin{array}{ccccc}
1 & \xi_{1} & \xi_{1}^{2} & \ldots & \xi_{1}^{n-1}\\
1 & \xi_{2} & \xi_{2}^{2} & \ldots & \xi_{2}^{n-1}\\
\vdots & \vdots & \vdots & \ddots & \vdots\\
1 & \xi_{m} & \xi_{m}^{2} & \ldots & \xi_{m}^{n-1}
\end{array}\right).
</dt-math>
</p>
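<p>
A small NumPy sketch of this setup, starting from the same ten points the interactive figures begin with:
</p>
<dt-code block language="python">
import numpy as np

# The ten 1-D inputs and observations the figures start from.
xi = np.array([-0.6, -0.55, -0.5, -0.45, -0.4, 0.4, 0.45, 0.5, 0.55, 0.6])
d  = np.array([-1.5, -2.0, -2.5, -1.5, -1.0, 0.5, 1.0, 1.5, 1.0, 0.5])
n  = 6                                           # a degree-5 polynomial model

# Feature matrix Z whose columns are the monomials 1, xi, xi^2, ..., xi^(n-1).
Z = np.vander(xi, n, increasing=True)

# Least squares: minimize 0.5 * || Z w - d ||^2
w, *_ = np.linalg.lstsq(Z, d, rcond=None)

# The eigenvectors Q of Z^T Z give the basis in which the iterations decouple.
lam, Q = np.linalg.eigh(Z.T @ Z)                 # eigenvalues in ascending order
</dt-code>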
<p>
The path of convergence, as we know, is elucidated when we view the iterates in the space of <dt-math>Q</dt-math> (the eigenvectors of <dt-math>Z^T Z</dt-math>). So let's recast our regression problem in the basis of <dt-math>Q</dt-math>. First, we do a change of basis, by rotating <dt-math>w</dt-math> into <dt-math>Qw</dt-math>, and counter-rotating our feature maps <dt-math>p</dt-math> into eigenspace, <dt-math>\bar{p}</dt-math>. We can now conceptualize the same regression as one over a different polynomial basis, with the model
<dt-math block>
\text{model}(\xi)~=~x_{1}\bar{p}_{1}(\xi)~+~\cdots~+~x_{n}\bar{p}_{n}(\xi)\qquad \bar{p}_{i}=\sum q_{ij}p_j.
</dt-math>
This model is identical to the old one. But these new features <dt-math>\bar{p}</dt-math> (which I call "eigenfeatures") and weights have the pleasing property that each coordinate acts independently of the others. Now our optimization problem breaks down, really, into <dt-math>n</dt-math> small 1D optimization problems. And each coordinate can be optimized greedily and independently, one at a time in any order, to produce the final, global, optimum. The eigenfeatures are also much more informative:
</p>
<figure id = "poly1" style="width:940px; height:285px"></figure>
<script>
deleteQueue.push(renderLoading(d3.select("#poly1")))
renderQueue.push(function(callback) {
var inv = function(lambda) { return 1/lambda }
var scal = function(lambda) { return lambda < 1e-10 ? -100 : 1.5/Math.sqrt(lambda) }
// Preprocess x, get eigendecomposition, etc
var x = [-0.6, -0.55,-0.5,-0.45,-0.4,0.4,0.45,0.5,0.55,0.6]
var D = vandermonde(x, 5)
var Eigs = eigSym(numeric.dot(numeric.transpose(D),D))
var U = Eigs.U
var lambda = Eigs.lambda
// Preprocess y
var b = [-3/2,-4/2,-5/2,-3/2,-2/2,1/2,2/2,3/2,2/2,1/2]
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var step = 1.8/lambda[0]
var iter = geniter(U, lambda, Dtb, step)
var eigensum = d3.select("#poly1")
var wi = lambda.slice(0).map(scal)
function refit(b) {
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var Utsol = numeric.dot(sol,U)
eigenControl.updateweights(sol)
}
var eigenControl = renderEigenPanel(eigensum, U, x, b, wi, refit, true)
var annotate = eigensum
annotate.append("figcaption")
.style("width", 230 + "px")
.style("height", 150 + "px")
.style("left", "0px")
.style("position", "absolute")
.style("padding", "10px")
.html("The data comes in 2 clusters. The first 2 eigenfeatures capture variations between the clusters. ")
annotate.append("figcaption")
.style("width", 230 + "px")
.style("height", 150 + "px")
.style("left", "260px")
.style("position", "absolute")
.style("padding", "10px")
.html("Next there are smooth variations within clusters, peaks within clusters,")
annotate.append("figcaption")
.style("width", 230 + "px")
.style("height", 150 + "px")
.style("left", 530 + "px")
.style("position", "absolute")
.style("padding", "10px")
.html("and finally, jagged polynomials which differ wildly on neighboring points. ");
// Swoopy Annotator
var annotations = [
{
"x": 0,
"y": 0,
"path": "M 807,198 A 26.661 26.661 0 0 1 838,159",
"text": "drag points to fit data",
"textOffset": [
799,
214
]
}]
drawAnnotations(eigensum, annotations)
callback(null);
});
</script>
<p>
The observations in the above diagram can be justified mathematically. From a statistical point of view, we would like a model which is, in some sense, robust to noise. Our model cannot possibly be meaningful if the slightest perturbation to the observations changes the entire model dramatically. And the eigenfeatures, the principal components of the data, give us exactly the decomposition we need to sort the features by their sensitivity to perturbations in the <dt-math>d_i</dt-math>'s. The most robust components appear in the front (with the largest eigenvalues), and the most sensitive components in the back (with the smallest eigenvalues).
</p>
<p>
This measure of robustness, by a rather convenient coincidence, is also a measure of how easily an eigenspace converges. And thus, the "pathological directions" -- the eigenspaces which converge the slowest -- are also those which are most sensitive to noise! So starting at a simple initial point like <dt-math>0</dt-math> (by a gross abuse of language, let's think of this as a prior), we track the iterates till a desired level of complexity is reached. Let's see how this plays out in gradient descent.
</p>
<figure id = "poly2" style="width:940px; height:360px"></figure>
<script>
deleteQueue.push(renderLoading(d3.select("#poly2")))
renderQueue.push(function(callback) {
var inv = function(lambda) { return 1/lambda }
var scal = function(lambda) { return lambda < 1e-10 ? -100 : 1.5/Math.sqrt(lambda) }
// Preprocess x, get eigendecomposition, etc
var x = [-0.6, -0.55,-0.5,-0.45,-0.4,0.4,0.45,0.5,0.55,0.6]
var b = [-3/2,-4/2,-5/2,-3/2,-2/2,1/2,2/2,3/2,2/2,1/2]
var D = vandermonde(x, 5)
var Eigs = eigSym(numeric.dot(numeric.transpose(D),D))
var U = Eigs.U
var lambda = Eigs.lambda
// Preprocess y
var Dtb = numeric.dot(b,D)
var sol = numeric.mul(numeric.dot(U, Dtb), lambda.map(inv))
var step = 1.8/lambda[0]
var iter = geniter(U, lambda, Dtb, step)
var eigensum = d3.select("#poly2")
var wi = lambda.slice(0).map(scal)
function refit(b) {
var Dtb = numeric.dot(b,D)
iter = geniter(U, lambda, Dtb, step)
onChange(sliderControl.slidera.xval())
}
var eigenControl = renderEigenPanel(eigensum, U, x, b, wi, refit, true)
var barlengths = getStepsConvergence(lambda, step).map(Math.log)
var onChange = function(i) {
eigenControl.updateweights(numeric.dot(U,iter(Math.floor(Math.exp(i-0.1)) )))
}
var sliderControl = sliderBarGen(barlengths, function(i) {return Math.exp(i-0.1)}).update(onChange)(d3.select("#poly2"))
d3.select("#poly2").append("figcaption")
.style("width", "120px")
.style("position", "absolute")
.style("left", "820px")
.style("top","200px")
.html("When an eigenspace has converged to three significant digits, the bar greys out. Drag the observations to change fit.")
sliderControl.slidera.init()
// var figwidth = d3.select("#poly2").style("width")
// var figheight = d3.select("#poly2").style("height")
// var svgannotate = d3.select("#poly2")
// .append("svg")
// .style("width", figwidth)
// .style("height", figheight)
// .style("position", "absolute")
// .style("top","0px")
// .style("left","0px")
// .style("pointer-events","none")
// renderDraggable(svgannotate,
// [139.88888549804688, 243.77951049804688],
// [121.88888549804688, 200.77951049804688],
// 5,
// "We begin at x=w=0");
// Swoopy Annotator
var annotations = [
{
"x": 0,
"y": 0,
"path": "M 74,202 A 52.274 52.274 0 0 0 134,245",
"text": "We begin at x=w=0",
"textOffset": [
21,
198
]
}
]
drawAnnotations(d3.select("#poly2"), annotations)
callback(null);
});
</script>
<p>
This effect is harnessed with the heuristic of early stopping: by stopping the optimization early, you can often get better generalizing results. Indeed, the effect of early stopping is very similar to that of more conventional methods of regularization, such as Tikhonov Regression. Both methods try to suppress the components of the smallest eigenvalues directly, though they employ different methods of spectral decay.<dt-fn>In Tikhonov Regression we add a quadratic penalty to the regression, minimizing
<dt-math block>
\text{minimize}\qquad\tfrac{1}{2}\|Zw-d\|^{2}+\frac{\eta}{2}\|w\|^{2}=\tfrac{1}{2}w^{T}(Z^{T}Z+\eta I)w-(Z^{T}d)^{T}w
</dt-math>
Recall that <dt-math>Z^{T}Z=Q\ \text{diag}(\lambda_{1},\ldots,\lambda_{n})\ Q^T</dt-math>. The solution to Tikhonov Regression is therefore
<dt-math block>
(Z^{T}Z+\eta I)^{-1}Z^{T}d=Q\ \text{diag}\left(\frac{1}{\lambda_{1}+\eta},\cdots,\frac{1}{\lambda_{n}+\eta}\right)Q^{T}Z^{T}d
</dt-math>
We can think of regularization as decaying the inverse eigenvalues, suppressing most strongly the components with the smallest <dt-math>\lambda_i</dt-math>, as follows:
<dt-math block>
\text{Tikhonov Regularized } \lambda_i = \frac{1}{\lambda_{i}+\eta}=\frac{1}{\lambda_{i}}\left(1-\left(1+\lambda_{i}/\eta\right)^{-1}\right).
</dt-math>
Gradient descent can be seen as employing a similar decay, but with the decay rate
<dt-math block> \text{ Gradient Descent Regularized } \lambda_i = \frac{1}{\lambda_i} \left( 1-\left(1-\alpha\lambda_{i}\right)^{k} \right)</dt-math>
instead. Note that this decay is dependent on the step-size.
</dt-fn> But early stopping has a distinct advantage. Once the step-size is chosen, there are no regularization parameters to fiddle with. Indeed, in the course of a single optimization, we have the entire family of models, from underfitted to overfitted, at our disposal. This gift, it seems, doesn't come at a price. A beautiful free lunch <dt-cite key="hintonNIPS"></dt-cite> indeed.
</p>
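<p>
The two spectral decays are easy to compare side by side, here with illustrative values of <dt-math>\eta</dt-math>, <dt-math>\alpha</dt-math> and <dt-math>k</dt-math>:
</p>
<dt-code block language="python">
import numpy as np

lam = np.array([0.01, 0.1, 1.0])     # eigenvalues of Z^T Z (illustrative)
eta, alpha, k = 0.1, 1.0, 20         # regularization strength, step-size, iteration count

# Tikhonov: each 1/lambda_i is replaced by 1/(lambda_i + eta).
tikhonov = 1 / (lam + eta)

# Early-stopped gradient descent: 1/lambda_i is replaced by
# (1 - (1 - alpha * lambda_i)^k) / lambda_i.
early_stop = (1 - (1 - alpha * lam) ** k) / lam

print(1 / lam)          # the unregularized inverse eigenvalues
print(tikhonov)         # the small-eigenvalue components are suppressed the most
print(early_stop)       # a similar suppression, controlled by k instead of eta
</dt-code>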
<hr>
<h2>The Dynamics of Momentum</h2>
<p>
Let's turn our attention back to momentum. Recall that the momentum update is
<dt-math block>
\begin{aligned}
z^{k+1}&=\beta z^{k}+\nabla f(w^{k})\\[0.4em]
w^{k+1}&=w^{k}-\alpha z^{k+1}.
\end{aligned}
</dt-math>
Since <dt-math>\nabla f(w^k) = Aw^k - b</dt-math>, the update on the quadratic is
<dt-math block>
\begin{aligned}
z^{k+1}&=\beta z^{k}+ (Aw^{k}-b)\\[0.4em]
w^{k+1}&=w^{k}-\alpha z^{k+1}.
\end{aligned}
</dt-math>
Following <dt-cite key="o2015adaptive"></dt-cite>, we go through the same motions, with the change of basis <dt-math>
x^{k} = Q^T(w^{k} - w^\star)</dt-math> and <dt-math> y^{k} = Q^Tz^{k}</dt-math>, to yield the update rule
<dt-math block>
\begin{aligned}
y_{i}^{k+1}&=\beta y_{i}^{k}+\lambda_{i}x_{i}^{k}\\[0.4em]
x_{i}^{k+1}&=x_{i}^{k}-\alpha y_{i}^{k+1}.
\end{aligned}
</dt-math>
in which each component acts independently of the other components (though <dt-math>x^k_i</dt-math> and <dt-math>y^k_i</dt-math> are coupled). This lets us rewrite our iterates as
<dt-fn>
This is true as we can write updates in matrix form as
<dt-math block>
\left(\!\!\begin{array}{cc}
1 & 0\\
\alpha & 1
\end{array}\!\!\right)\Bigg(\!\!\begin{array}{c}
y_{i}^{k+1}\\
x_{i}^{k+1}
\end{array}\!\!\Bigg)=\left(\!\!\begin{array}{cc}
\beta & \lambda_{i}\\
0 & 1
\end{array}\!\!\right)\left(\!\!\begin{array}{c}
y_{i}^{k}\\
x_{i}^{k}
\end{array}\!\!\right)
</dt-math>
which implies, by inverting the matrix on the left,
<dt-math block>
\Bigg(\!\!\begin{array}{c}
y_{i}^{k+1}\\
x_{i}^{k+1}
\end{array}\!\!\Bigg)=\left(\!\!\begin{array}{cc}
\beta & \lambda_{i}\\
-\alpha\beta & 1-\alpha\lambda_{i}
\end{array}\!\!\right)\left(\!\!\begin{array}{c}
y_{i}^{k}\\
x_{i}^{k}
\end{array}\!\!\right)=R^{k+1}\left(\!\!\begin{array}{c}
y_{i}^{0}\\
x_{i}^{0}
\end{array}\!\!\right)
</dt-math>
</dt-fn>
<dt-math block>
\left(\!\!\begin{array}{c}
y_{i}^{k}\\
x_{i}^{k}
\end{array}\!\!\right)=R^k\left(\!\!\begin{array}{c}
y_{i}^{0}\\
x_{i}^{0}
\end{array}\!\!\right)
\qquad
R = \left(\!\!\begin{array}{cc}
\beta & \lambda_{i}\\
-\alpha\beta & 1-\alpha\lambda_{i}
\end{array}\!\!\right).
</dt-math>
There are many ways of taking a matrix to the <dt-math>k^{th}</dt-math> power. But for the <dt-math>2 \times 2</dt-math> case there is an elegant and little-known formula <dt-cite key="williamsnthpower"></dt-cite> in terms of the eigenvalues of <dt-math>R</dt-math>, <dt-math>\sigma_1</dt-math> and <dt-math>\sigma_2</dt-math>.
<dt-math block>
\color{#AAA}{\color{black}{R^{k}}=\begin{cases}
\color{black}{\sigma_{1}^{k}}R_{2}-\color{black}{\sigma_{2}^{k}}R_{1} & \sigma_{1}\neq\sigma_{2}\\
\sigma_{1}^{k}(kR/\sigma_1-(k-1)I) & \sigma_{1}=\sigma_{2}
\end{cases},\qquad R_{j}=\frac{R-\sigma_{j}I}{\sigma_{1}-\sigma_{2}}}
</dt-math>
This formula is rather complicated, but the takeaway here is that it plays the exact same role that the individual convergence rates, <dt-math>1-\alpha\lambda_i</dt-math>, play in gradient descent. But instead of one geometric series, we have two coupled series, which may have real or complex values. The convergence rate is therefore the slower of the two rates, <dt-math>\max\{|\sigma_{1}|,|\sigma_{2}|\}</dt-math>
<dt-fn>
We can write out the convergence rates explicitly. The eigenvalues are
<dt-math block>
\begin{aligned}
\sigma_{1} & =\frac{1}{2}\left(1-\alpha\lambda+\beta+\sqrt{(-\alpha\lambda+\beta+1)^{2}-4\beta}\right)\\[0.6em]
\sigma_{2} & =\frac{1}{2}\left(1-\alpha\lambda+\beta-\sqrt{(-\alpha\lambda+\beta+1)^{2}-4\beta}\right)
\end{aligned}
</dt-math>
When <dt-math>(-\alpha\lambda+\beta+1)^{2}-4\beta</dt-math> is less than zero,
the roots are complex and the convergence rate is
<dt-math block>
\begin{aligned}
|\sigma_{1}|=|\sigma_{2}| & =\tfrac{1}{2}\sqrt{(1-\alpha\lambda+\beta)^{2}+|(-\alpha\lambda+\beta+1)^{2}-4\beta|}=\sqrt{\beta}
\end{aligned}
</dt-math>
This rate is, surprisingly, independent of the step-size <dt-math>\alpha</dt-math> and of the eigenvalue <dt-math>\lambda</dt-math>. When the roots are real, the convergence rate is
<dt-math block>
\max\{|\sigma_{1}|,|\sigma_{2}|\}=\tfrac{1}{2}\max\left\{ |1-\alpha\lambda+\beta\pm\sqrt{(1-\alpha\lambda+\beta)^{2}-4\beta}|\right\}
</dt-math>
</dt-fn>. By plotting this out, we see there are distinct regions of the parameter space which reveal a rich taxonomy of convergence behavior <dt-cite key="flammarion2015averaging"></dt-cite>:
</p>
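<p>
A small NumPy sketch makes this concrete: build the <dt-math>2\times 2</dt-math> matrix <dt-math>R</dt-math> for a single eigencomponent and take the largest modulus of its eigenvalues, letting the eigenvalue routine handle the real and complex cases uniformly.
</p>
<dt-code block language="python">
import numpy as np

def momentum_rate(alpha, beta, lam):
    """Convergence rate max(|sigma_1|, |sigma_2|) of one eigencomponent."""
    R = np.array([[beta,          lam],
                  [-alpha * beta, 1 - alpha * lam]])
    sigma = np.linalg.eigvals(R)       # a real pair or a complex-conjugate pair
    return np.max(np.abs(sigma))

# With beta = 0 this reduces to the gradient descent rate |1 - alpha * lam|:
print(momentum_rate(alpha=1.0, beta=0.0, lam=0.1))   # 0.9
# In the complex regime the rate is sqrt(beta), independent of alpha and lam:
print(momentum_rate(alpha=1.0, beta=0.9, lam=0.1))   # about 0.9487 = sqrt(0.9)
</dt-code>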
<figure id = "momentum2D" style="width:984px; height:540px">
<div class = "l-body" style="display:block">
<div id = "momentumCanvas" style="position:absolute; left:45px"></div>
<div id = "momentumAnnotation" style="position:absolute; width: 204px; height: 80px; left: 630px; top: 20px;"></div>
<div style="position:absolute; width: 204px; height: 80px; left: 643px; top: 10px;" class ="figtext" >
Convergence Rate
</div>
<figcaption style="position:absolute; width: 204px; height: 80px; left: 645px; top: 86px;">
A plot of <dt-math>\max\{|\sigma_1|, |\sigma_2|\}</dt-math> reveals distinct regions, each with its own style of convergence.
</figcaption>
</div>
<div id = "taxonomy"></div>
<svg id="momentumOverlay" style="position:absolute; width:984px; height:540px; z-index:4; pointer-events:none"></svg>
</figure>
<script src="assets/momentum.js"></script>
<script>
deleteQueue.push(renderLoading(d3.select("#momentum2D")))
renderQueue.push(function(callback) {
var defaults = [[0.0015, 0.9],
[0.0015, 0.125],
[0.01, 0.00001],
[0.02, 0.05 ],
[0.025, 0.235 ]]
coor = render2DSliderGen(
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[0](xy[0], xy[1],bold)
updateStemGraphs[0](a,b)
},
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[1](xy[0], xy[1],bold)
updateStemGraphs[1](a,b)
},
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[2](xy[0], xy[1],bold)
updateStemGraphs[2](a,b)
},
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[3](xy[0], xy[1],bold)
updateStemGraphs[3](a,b)
},
function(a,b,bold) {
var xy = coor(a,b)
updatePaths[4](xy[0], xy[1],bold)
updateStemGraphs[4](a,b)
}, defaults)(d3.select("#momentumCanvas"))
var tax = renderTaxonomy(d3.select("#momentum2D"))
var updatePaths = renderOverlay(d3.select("#momentumOverlay"), tax.div)
var updateStemGraphs = tax.update
colorMap(
d3.select("#momentumAnnotation"),
180,
d3.scaleLinear().domain([0,0.3,0.5,0.7,1,1.01]).range(colorbrewer.YlGnBu[5].concat(["black"])),
d3.scaleLinear().domain([0,1.2001]).range([0, 180])
)
var up = function (i, alpha, beta) {
var xy = coor(alpha, beta)
updatePaths[i](xy[0], xy[1], true)
updateStemGraphs[i](alpha,beta)
}
for (var i = 0; i<5; i++) {
up(i,defaults[i][0], defaults[i][1])