averages.tex

% This file is part of the GROMACS molecular simulation package.
%
% Copyright (c) 2013, by the GROMACS development team, led by
% David van der Spoel, Berk Hess, Erik Lindahl, and including many
% others, as listed in the AUTHORS file in the top-level source
% directory and at http://www.gromacs.org.
%
% GROMACS is free software; you can redistribute it and/or
% modify it under the terms of the GNU Lesser General Public License
% as published by the Free Software Foundation; either version 2.1
% of the License, or (at your option) any later version.
%
% GROMACS is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
% Lesser General Public License for more details.
%
% You should have received a copy of the GNU Lesser General Public
% License along with GROMACS; if not, see
% http://www.gnu.org/licenses, or write to the Free Software Foundation,
% Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
%
% If you want to redistribute modifications to GROMACS, please
% consider that scientific software is very special. Version
% control is crucial - bugs must be traceable. We will be happy to
% consider code for inclusion in the official distribution, but
% derived work must not be called official GROMACS. Details are found
% in the README & COPYING files - if they are missing, get the
% official version at http://www.gromacs.org.
%
% To help us fund GROMACS development, we humbly ask that you cite
% the research papers on the package. Check out http://www.gromacs.org

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%       AVERAGES AND FLUCTUATIONS
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Averages and fluctuations}
\section{Formulae for averaging}
{\bf Note:} this section was taken from ref~\cite{Gunsteren94a}.

When analyzing a MD trajectory averages $\left<x\right>$ and fluctuations
\beq
 \left<(\Delta x)^2\right>^{\half} ~=~ \left<[x-\left<x\right>]^2\right>^{\half}
\label{eqn:var0}
\eeq
of a quantity $x$ are to be computed.
The variance $\sigma_x$ of a series of N$_x$ values, 
\{x$_i$\}, can be computed from
\beq
\sigma_x~=~ \sum_{i=1}^{N_x} x_i^2 ~-~  \frac{1}{N_x}\left(\sum_{i=1}^{N_x}x_i\right)^2
\label{eqn:var1}
\eeq
Unfortunately this formula is numerically not very accurate, 
especially when $\sigma_x^{\half}$ is small compared to the values of $x_i$. 
The following (equivalent) expression is numerically more accurate
\beq
\sigma_x ~=~ \sum_{i=1}^{N_x} [x_i  - \left<x\right>]^2
\eeq
with
\beq
  \left<x\right> ~=~ \frac{1}{N_x} \sum_{i=1}^{N_x} x_i
\label{eqn:var2}
\eeq
Using ~\eqnsref{var1}{var2} one has to go 
through the series of $x_i$ values twice, once to determine 
$\left<x\right>$ and again to 
compute $\sigma_x$, 
whereas \eqnref{var0} requires only one sequential scan of
the series \{x$_i$\}. However, one may cast \eqnref{var1} in
another form, containing partial sums, which allows for a sequential 
update algorithm. Define the partial sum
\beq
          X_{n,m} ~=~ \sum_{i=n}^{m} x_i                      
\eeq
and the partial variance
\beq
    \sigma_{n,m} ~=~ \sum_{i=n}^{m}  \left[x_i - \frac{X_{n,m}}{m-n+1}\right]^2  
\label{eqn:sigma}
\eeq
It can be shown that
\beq
          X_{n,m+k} ~=~  X_{n,m} + X_{m+1,m+k}         
\label{eqn:Xpartial}
\eeq
and
\bea
\sigma_{n,m+k} &=& \sigma_{n,m} + \sigma_{m+1,m+k} + \left[~\frac {X_{n,m}}{m-n+1} - \frac{X_{n,m+k}}{m+k-n+1}~\right]^2~* \nonumber\\
   && ~\frac{(m-n+1)(m+k-n+1)}{k}
\label{eqn:varpartial}
\eea
For $n=1$ one finds
\beq
\sigma_{1,m+k} ~=~ \sigma_{1,m} + \sigma_{m+1,m+k}~+~
  \left[~\frac{X_{1,m}}{m} - \frac{X_{1,m+k}}{m+k}~\right]^2~ \frac{m(m+k)}{k}
\label{eqn:sig1}
\eeq
and for $n=1$ and $k=1$ ~(\eqnref{varpartial}) becomes
\bea
\sigma_{1,m+1}  &=& \sigma_{1,m} + 
                        \left[\frac{X_{1,m}}{m} - \frac{X_{1,m+1}}{m+1}\right]^2 m(m+1)\\
                &=& \sigma_{1,m} + 
                        \frac {[~X_{1,m} - m x_{m+1}~]^2}{m(m+1)}
\label{eqn:simplevar0}
\eea
where we have used the relation
\beq
     X_{1,m+1} ~=~  X_{1,m} + x_{m+1}                       
\label{eqn:simplevar1}
\eeq
Using formulae~(\eqnref{simplevar0}) and ~(\eqnref{simplevar1}) the average 
\beq
\left<x\right> ~=~ \frac{X_{1,N_x}}{N_x}
\eeq
and the fluctuation 
\beq
\left<(\Delta x)^2\right>^{\half} = \left[\frac {\sigma_{1,N_x}}{N_x}\right]^{\half}
\eeq
can be obtained by one sweep through the data. 

\section{Implementation}
In {\gromacs} the instantaneous
energies $E(m)$ are stored in the \swapindex{energy}{file}, along with the 
values of $\sigma_{1,m}$ and $X_{1,m}$. Although the steps are counted from 0,
for the energy and fluctuations steps are counted from 1. This means that the
equations presented here are the ones that are implemented.
We give somewhat lengthy derivations in this section
to simplify checking of code and equations later on.

\subsection{Part of a Simulation}
It is not uncommon to perform a simulation where the first part,
{\eg} 100 ps, is taken as \normindex{equilibration}. However, the
averages and fluctuations as printed in the \swapindex{log}{file}
are computed over the whole simulation. The equilibration time,
which is now part of the simulation, may in such a case invalidate the
averages and fluctuations, because these numbers are now dominated
by the initial drift towards equilibrium.

Using~\eqnsref{Xpartial}{varpartial} the average and 
standard deviation over part of the trajectory can be computed as:
\bea
X_{m+1,m+k}     &=& X_{1,m+k} - X_{1,m}                 \\
\sigma_{m+1,m+k} &=& \sigma_{1,m+k}-\sigma_{1,m} - \left[~\frac{X_{1,m}}{m} - \frac{X_{1,m+k}}{m+k}~\right]^{2}~ \frac{m(m+k)}{k}
\eea

or, more generally (with $p \geq 1$ and $q \geq p$):
\bea
X_{p,q}         &=&     X_{1,q} - X_{1,p-1}     \\
\sigma_{p,q}    &=&     \sigma_{1,q}-\sigma_{1,p-1} - \left[~\frac{X_{1,p-1}}{p-1} - \frac{X_{1,q}}{q}~\right]^{2}~ \frac{(p-1)q}{q-p+1}
\eea
{\bf Note} that implementation of this is not entirely trivial, since energies
are not stored every time step of the simulation. We therefore have to construct
$X_{1,p-1}$ and $\sigma_{1,p-1}$ from the information at time $p$ using
\eqnsref{simplevar0}{simplevar1}:
\bea
X_{1,p-1}       &=&     X_{1,p} - x_p   \\
\sigma_{1,p-1}  &=&     \sigma_{1,p} -  \frac {[~X_{1,p-1} - (p-1) x_{p}~]^2}{(p-1)p}
\eea

\subsection{Combining two simulations}
Another frequently occurring problem is, that the fluctuations of two simulations
must be combined. Consider the following example: we have two simulations
(A) of $n$ and (B) of $m$ steps, in which the second simulation is a 
continuation of the first. However, the second simulation starts numbering from 1
instead of from $n+1$. For the partial sum
this is no problem, we have to add $X_{1,n}^A$ from run A:
\beq
X_{1,n+m}^{AB} ~=~ X_{1,n}^A + X_{1,m}^B
\label{eqn:pscomb}
\eeq
When we want to compute the partial variance from the two components we have to 
make a correction $\Delta\sigma$:
\beq
\sigma_{1,n+m}^{AB} ~=~ \sigma_{1,n}^A + \sigma_{1,m}^B +\Delta\sigma
\eeq
if we define $x_i^{AB}$ as the combined and renumbered set of data points we can 
write:
\beq
\sigma_{1,n+m}^{AB} ~=~ \sum_{i=1}^{n+m}  \left[x_i^{AB} - \frac{X_{1,n+m}^{AB}}{n+m}\right]^2  
\eeq
and thus
\beq
\sum_{i=1}^{n+m}  \left[x_i^{AB} - \frac{X_{1,n+m}^{AB}}{n+m}\right]^2  ~=~
\sum_{i=1}^{n}  \left[x_i^{A} - \frac{X_{1,n}^{A}}{n}\right]^2  +
\sum_{i=1}^{m}  \left[x_i^{B} - \frac{X_{1,m}^{B}}{m}\right]^2  +\Delta\sigma
\eeq
or
\bea
\sum_{i=1}^{n+m}  \left[(x_i^{AB})^2 - 2 x_i^{AB}\frac{X^{AB}_{1,n+m}}{n+m} + \left(\frac{X^{AB}_{1,n+m}}{n+m}\right)^2  \right] &-& \nonumber \\
\sum_{i=1}^{n}  \left[(x_i^{A})^2 - 2 x_i^{A}\frac{X^A_{1,n}}{n} + \left(\frac{X^A_{1,n}}{n}\right)^2  \right] &-& \nonumber \\
\sum_{i=1}^{m}  \left[(x_i^{B})^2 - 2 x_i^{B}\frac{X^B_{1,m}}{m} + \left(\frac{X^B_{1,m}}{m}\right)^2  \right] &=& \Delta\sigma
\eea
all the $x_i^2$ terms drop out, and the terms independent of the summation
counter $i$ can be simplified:
\bea
\frac{\left(X^{AB}_{1,n+m}\right)^2}{n+m} \,-\, 
\frac{\left(X^A_{1,n}\right)^2}{n} \,-\, 
\frac{\left(X^B_{1,m}\right)^2}{m} &-& \nonumber \\
2\,\frac{X^{AB}_{1,n+m}}{n+m}\sum_{i=1}^{n+m}x_i^{AB} \,+\,
2\,\frac{X^{A}_{1,n}}{n}\sum_{i=1}^{n}x_i^{A} \,+\,
2\,\frac{X^{B}_{1,m}}{m}\sum_{i=1}^{m}x_i^{B} &=& \Delta\sigma
\eea
we recognize the three partial sums on the second line and use \eqnref{pscomb}
to obtain:
\beq
\Delta\sigma ~=~ \frac{\left(mX^A_{1,n} - nX^B_{1,m}\right)^2}{nm(n+m)}
\eeq
if we check this by inserting $m=1$ we get back \eqnref{simplevar0}

\subsection{Summing energy terms}
The {\tt \normindex{g_energy}} program can also sum energy terms into one, {\eg} 
potential + kinetic = total. For the partial averages this is again easy
if we have $S$ energy components $s$:
\beq
X_{m,n}^S ~=~ \sum_{i=m}^n \sum_{s=1}^S x_i^s ~=~ \sum_{s=1}^S \sum_{i=m}^n x_i^s ~=~ \sum_{s=1}^S X_{m,n}^s
\label{eqn:sumterms}
\eeq
For the fluctuations it is less trivial again, considering for example 
that the fluctuation in potential and kinetic energy should cancel. 
Nevertheless we can try the same approach as before by writing:
\beq
\sigma_{m,n}^S ~=~ \sum_{s=1}^S \sigma_{m,n}^s + \Delta\sigma
\eeq
if we fill in \eqnref{sigma}:
\beq
\sum_{i=m}^n \left[\left(\sum_{s=1}^S x_i^s\right) - \frac{X_{m,n}^S}{m-n+1}\right]^2 ~=~
\sum_{s=1}^S \sum_{i=m}^n \left[\left(x_i^s\right) - \frac{X_{m,n}^s}{m-n+1}\right]^2 + \Delta\sigma
\label{eqn:sigmaterms}
\eeq
which we can expand to:
\bea
&~&\sum_{i=m}^n \left[\sum_{s=1}^S (x_i^s)^2 + \left(\frac{X_{m,n}^S}{m-n+1}\right)^2 -2\left(\frac{X_{m,n}^S}{m-n+1}\sum_{s=1}^S x_i^s + \sum_{s=1}^S \sum_{s'=s+1}^S x_i^s x_i^{s'} \right)\right]    \nonumber \\
&-&\sum_{s=1}^S \sum_{i=m}^n \left[(x_i^s)^2 - 2\,\frac{X_{m,n}^s}{m-n+1}\,x_i^s + \left(\frac{X_{m,n}^s}{m-n+1}\right)^2\right] ~=~\Delta\sigma 
\eea
the terms with $(x_i^s)^2$ cancel, so that we can simplify to:
\bea
&~&\frac{\left(X_{m,n}^S\right)^2}{m-n+1} -2 \frac{X_{m,n}^S}{m-n+1}\sum_{i=m}^n\sum_{s=1}^S x_i^s -2\sum_{i=m}^n\sum_{s=1}^S \sum_{s'=s+1}^S x_i^s x_i^{s'}\, -        \nonumber \\
&~&\sum_{s=1}^S \sum_{i=m}^n \left[- 2\,\frac{X_{m,n}^s}{m-n+1}\,x_i^s + \left(\frac{X_{m,n}^s}{m-n+1}\right)^2\right] ~=~\Delta\sigma 
\eea
or
\beq
-\frac{\left(X_{m,n}^S\right)^2}{m-n+1}  -2\sum_{i=m}^n\sum_{s=1}^S \sum_{s'=s+1}^S x_i^s x_i^{s'}\, +  \sum_{s=1}^S \frac{\left(X_{m,n}^s\right)^2}{m-n+1}  ~=~\Delta\sigma 
\eeq
If we now expand the first term using \eqnref{sumterms} we obtain:
\beq
-\frac{\left(\sum_{s=1}^SX_{m,n}^s\right)^2}{m-n+1}  -2\sum_{i=m}^n\sum_{s=1}^S \sum_{s'=s+1}^S x_i^s x_i^{s'}\, +      \sum_{s=1}^S \frac{\left(X_{m,n}^s\right)^2}{m-n+1}  ~=~\Delta\sigma 
\eeq
which we can reformulate to:
\beq
-2\left[\sum_{s=1}^S \sum_{s'=s+1}^S X_{m,n}^s X_{m,n}^{s'}\,+\sum_{i=m}^n\sum_{s=1}^S \sum_{s'=s+1}^S x_i^s x_i^{s'}\right] ~=~\Delta\sigma 
\eeq
or
\beq
-2\left[\sum_{s=1}^S X_{m,n}^s \sum_{s'=s+1}^S X_{m,n}^{s'}\,+\,\sum_{s=1}^S \sum_{i=m}^nx_i^s \sum_{s'=s+1}^S x_i^{s'}\right] ~=~\Delta\sigma 
\eeq
which gives
\beq
-2\sum_{s=1}^S \left[X_{m,n}^s \sum_{s'=s+1}^S \sum_{i=m}^n x_i^{s'}\,+\,\sum_{i=m}^n x_i^s \sum_{s'=s+1}^S x_i^{s'}\right] ~=~\Delta\sigma 
\eeq
Since we need all data points $i$ to evaluate this, in general this is not possible.
We can then make an estimate of $\sigma_{m,n}^S$ using only the data points that 
are available using the left hand side of \eqnref{sigmaterms}. While the average can
be computed using all time steps in the simulation, the accuracy of the
fluctuations is thus limited by the frequency with which energies are saved.
Since this can be easily done with a program such as \normindex{xmgr} this is not 
built-in in {\gromacs}.

% LocalWords:  Formulae varpartial formulae simplevar Xpartial pscomb mX nX SX
% LocalWords:  sumterms nx sigmaterms xmgr ps nm