From 26fc7c7e6aff3f205ea9ed6d934be04c2a4ce715 Mon Sep 17 00:00:00 2001
From: M Clark <mclark@nvidia.com>
Date: Tue, 26 May 2015 00:03:10 -0700
Subject: [PATCH 1/7] Rewrote cloverDerivative to utilize gauge::FloatNOrder,
 adding support for 12-reconstruct for the gauge field.

---
 lib/clover_deriv_quda.cu | 614 ++++++++++++++++++++-------------------
 1 file changed, 312 insertions(+), 302 deletions(-)
diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu
index 23dff8ba6c..f9a51c1b40 100644
--- a/lib/clover_deriv_quda.cu
+++ b/lib/clover_deriv_quda.cu
@@ -4,49 +4,41 @@
 #include <quda_internal.h>
 #include <tune_quda.h>
 #include <gauge_field.h>
+#include <gauge_field_order.h>
 #include <quda_matrix.h>
 #include <cassert>
 
 namespace quda {
-
+  
 #ifdef GPU_CLOVER_DIRAC
-
-  template<class Cmplx>
-    struct CloverDerivArg
+  
+  template<class Cmplx, typename Force, typename Gauge, typename Oprod>
+  struct CloverDerivArg
+  {
+    int X[4];
+    int border[4];
+    int mu;
+    int nu;
+    typename RealTypeId<Cmplx>::Type coeff;
+    int parity;
+    int volumeCB;
+
+    Force force;
+    Gauge gauge;
+    Oprod oprod;
+
+    bool conjugate;      
+
+    CloverDerivArg(const Force& force, const Gauge& gauge, const Oprod& oprod, const int *X, 
+		   int mu, int nu, double coeff, int parity, bool conjugate) :  
+      mu(mu), nu(nu), coeff(coeff), parity(parity), volumeCB(force.volumeCB), 
+      force(force), gauge(gauge), oprod(oprod), conjugate(conjugate)
     {
-      int X[4];
-      int border[4];
-      int mu;
-      int nu;
-      typename RealTypeId<Cmplx>::Type coeff;
-      int parity;
-      int volumeCB;
-
-      Cmplx* gauge;
-      Cmplx* force;
-      Cmplx* oprod;
-
-      int forceStride;
-      int gaugeStride;
-      int oprodStride;
-
-      int forceOffset;
-      int gaugeOffset;
-      int oprodOffset;
-     
-      bool conjugate;      
-
-      CloverDerivArg(cudaGaugeField& force, cudaGaugeField& gauge, cudaGaugeField& oprod, int mu, int nu, double coeff, int parity, bool conjugate) :  
-        mu(mu), nu(nu), coeff(coeff), parity(parity), volumeCB(force.VolumeCB()), 
-        force(reinterpret_cast<Cmplx*>(force.Gauge_p())),  gauge(reinterpret_cast<Cmplx*>(gauge.Gauge_p())), oprod(reinterpret_cast<Cmplx*>(oprod.Gauge_p())),
-        forceStride(force.Stride()), gaugeStride(gauge.Stride()), oprodStride(oprod.Stride()),
-        forceOffset(force.Bytes()/(2*sizeof(Cmplx))), gaugeOffset(gauge.Bytes()/(2*sizeof(Cmplx))), oprodOffset(oprod.Bytes()/(2*sizeof(Cmplx)))
-      {
-        for(int dir=0; dir<4; ++dir) X[dir] = force.X()[dir];
-        //for(int dir=0; dir<4; ++dir) border[dir] =  commDimPartitioned(dir) ? 2 : 0;
-        for(int dir=0; dir<4; ++dir) border[dir] = 2;
-      }
-    };
+      for(int dir=0; dir<4; ++dir) this->X[dir] = X[dir];
+      //for(int dir=0; dir<4; ++dir) border[dir] =  commDimPartitioned(dir) ? 2 : 0;
+      for(int dir=0; dir<4; ++dir) border[dir] = 2;
+    }
+  };
 
   __device__ void getCoords(int x[4], int cb_index, const int X[4], int parity)
   {
@@ -54,7 +46,6 @@ namespace quda {
     x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
     x[1] = (cb_index/(X[0]/2)) % X[1];
     x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1);
-
     return;
   }
 
@@ -64,248 +55,235 @@ namespace quda {
     for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
     return (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0])/2;
   }
+  
+  
+  template<typename Cmplx, bool isConjugate, typename Arg>
+  __global__ void 
+  cloverDerivativeKernel(Arg arg)
+  {
+    typedef typename RealTypeId<Cmplx>::Type real;
 
+    int index = threadIdx.x + blockIdx.x*blockDim.x;
 
-  template<typename Cmplx, bool isConjugate>
-    __global__ void 
-    cloverDerivativeKernel(const CloverDerivArg<Cmplx> arg)
-    {
-      int index = threadIdx.x + blockIdx.x*blockDim.x;
-
-      if(index >= arg.volumeCB) return;
-
-
-      int x[4];
-      int y[4];
-      int otherparity = (1-arg.parity);
-      getCoords(x, index, arg.X, arg.parity);
-      getCoords(y, index, arg.X, otherparity);
-      int X[4]; 
-      for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
-
-      for(int dir=0; dir<4; ++dir){
-        x[dir] += arg.border[dir];
-        y[dir] += arg.border[dir];
-        X[dir] += 2*arg.border[dir];
-      }
-
-      Cmplx* thisGauge = arg.gauge + arg.parity*arg.gaugeOffset;
-      Cmplx* otherGauge = arg.gauge + (otherparity)*arg.gaugeOffset;
-
-      Cmplx* thisOprod = arg.oprod + arg.parity*arg.oprodOffset;
+    if(index >= arg.volumeCB) return;
 
-      const int& mu = arg.mu;
-      const int& nu = arg.nu;
+    int x[4];
+    int y[4];
+    int otherparity = (1-arg.parity);
+    getCoords(x, index, arg.X, arg.parity);
+    getCoords(y, index, arg.X, otherparity);
+    int X[4]; 
+    for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
+
+    for(int dir=0; dir<4; ++dir){
+      x[dir] += arg.border[dir];
+      y[dir] += arg.border[dir];
+      X[dir] += 2*arg.border[dir];
+    }
 
-      Matrix<Cmplx,3> thisForce;
-      Matrix<Cmplx,3> otherForce;
+    const int& mu = arg.mu;
+    const int& nu = arg.nu;
 
-      // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)
-      {
-        int d[4] = {0, 0, 0, 0};
+    Matrix<Cmplx,3> thisForce;
+    Matrix<Cmplx,3> otherForce;
 
-        // load U(x)_(+mu)
-        Matrix<Cmplx,3> U1;
-        loadLinkVariableFromArray(thisGauge, mu, linkIndex(x, d, X), 
-            arg.gaugeStride, &U1);
+    // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)
+    {
+      int d[4] = {0, 0, 0, 0};
 
+      // load U(x)_(+mu)
+      Matrix<Cmplx,3> U1;
+      arg.gauge.load((real*)(U1.data), linkIndex(x, d, X), mu, arg.parity);
 
-        // load U(x+mu)_(+nu)
-        Matrix<Cmplx,3> U2;
-        d[mu]++;
-        loadLinkVariableFromArray(otherGauge, nu, linkIndex(x, d, X), 
-            arg.gaugeStride, &U2);
-        d[mu]--;
+      // load U(x+mu)_(+nu)
+      Matrix<Cmplx,3> U2;
+      d[mu]++;
+      arg.gauge.load((real*)(U2.data), linkIndex(x, d, X), nu, otherparity);
+      d[mu]--;
 
 
-        // load U(x+nu)_(+mu) 
-        Matrix<Cmplx,3> U3;
-        d[nu]++;
-        loadLinkVariableFromArray(otherGauge, mu, linkIndex(x, d, X),
-            arg.gaugeStride, &U3);
-        d[nu]--;
+      // load U(x+nu)_(+mu) 
+      Matrix<Cmplx,3> U3;
+      d[nu]++;
+      arg.gauge.load((real*)(U3.data), linkIndex(x, d, X), mu, otherparity);
+      d[nu]--;
       
-        // load U(x)_(+nu)
-        Matrix<Cmplx,3> U4;
-        loadLinkVariableFromArray(thisGauge, nu, linkIndex(x, d, X),
-            arg.gaugeStride, &U4);
-
-        // load Oprod
-        Matrix<Cmplx,3> Oprod1;
-        loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod1);
+      // load U(x)_(+nu)
+      Matrix<Cmplx,3> U4;
+      arg.gauge.load((real*)(U4.data), linkIndex(x, d, X), nu, arg.parity);
 
-        if(isConjugate) Oprod1 -= conj(Oprod1);
-        thisForce = U1*U2*conj(U3)*conj(U4)*Oprod1;
+      // load Oprod
+      Matrix<Cmplx,3> Oprod1;
+      arg.oprod.load((real*)(Oprod1.data), linkIndex(x, d, X), 0, arg.parity);
 
-        Matrix<Cmplx,3> Oprod2;
-        d[mu]++; d[nu]++;
-        loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod2);
-        d[mu]--; d[nu]--;
+      if(isConjugate) Oprod1 -= conj(Oprod1);
+      thisForce = U1*U2*conj(U3)*conj(U4)*Oprod1;
 
-        if(isConjugate) Oprod2 -= conj(Oprod2);
+      Matrix<Cmplx,3> Oprod2;
+      d[mu]++; d[nu]++;
+      arg.oprod.load((real*)(Oprod2.data), linkIndex(x, d, X), 0, arg.parity);
+      d[mu]--; d[nu]--;
 
-        thisForce += U1*U2*Oprod2*conj(U3)*conj(U4);
+      if(isConjugate) Oprod2 -= conj(Oprod2);
 
-      } 
+      thisForce += U1*U2*Oprod2*conj(U3)*conj(U4);
+    } 
  
-      { 
-        int d[4] = {0, 0, 0, 0};
-        // load U(x)_(+mu)
-        Matrix<Cmplx,3> U1;
-        loadLinkVariableFromArray(otherGauge, mu, linkIndex(y, d, X),
-            arg.gaugeStride, &U1);
-
-        // load U(x+mu)_(+nu)
-        Matrix<Cmplx,3> U2;
-        d[mu]++;
-        loadLinkVariableFromArray(thisGauge, nu, linkIndex(y, d, X),
-            arg.gaugeStride, &U2);
-        d[mu]--;
-
-        // load U(x+nu)_(+mu) 
-        Matrix<Cmplx,3> U3;
-        d[nu]++;
-        loadLinkVariableFromArray(thisGauge, mu, linkIndex(y, d, X),
-            arg.gaugeStride, &U3);
-        d[nu]--;
-
-        // load U(x)_(+nu)
-        Matrix<Cmplx,3> U4;
-        loadLinkVariableFromArray(otherGauge, nu, linkIndex(y, d, X),
-            arg.gaugeStride, &U4);
-
-        // load opposite parity Oprod
-        Matrix<Cmplx,3> Oprod3;
-        d[nu]++;
-        loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod3);
-        d[nu]--;
-
-        if(isConjugate) Oprod3 -= conj(Oprod3);
-        otherForce = U1*U2*conj(U3)*Oprod3*conj(U4);
-
-        // load Oprod(x+mu)
-        Matrix<Cmplx, 3> Oprod4;
-        d[mu]++;
-        loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod4);
-        d[mu]--;
-
-        if(isConjugate) Oprod4 -= conj(Oprod4);
-
-        otherForce += U1*Oprod4*U2*conj(U3)*conj(U4);
-      }
+    { 
+      int d[4] = {0, 0, 0, 0};
+      // load U(x)_(+mu)
+      Matrix<Cmplx,3> U1;
+      arg.gauge.load((real*)(U1.data), linkIndex(y, d, X), mu, otherparity);
+
+      // load U(x+mu)_(+nu)
+      Matrix<Cmplx,3> U2;
+      d[mu]++;
+      arg.gauge.load((real*)(U2.data), linkIndex(y, d, X), nu, arg.parity);
+      d[mu]--;
+
+      // load U(x+nu)_(+mu) 
+      Matrix<Cmplx,3> U3;
+      d[nu]++;
+      arg.gauge.load((real*)(U3.data), linkIndex(y, d, X), mu, arg.parity);
+      d[nu]--;
+
+      // load U(x)_(+nu)
+      Matrix<Cmplx,3> U4;
+      arg.gauge.load((real*)(U4.data), linkIndex(y, d, X), nu, otherparity);
+
+      // load opposite parity Oprod
+      Matrix<Cmplx,3> Oprod3;
+      d[nu]++;
+      arg.oprod.load((real*)(Oprod3.data), linkIndex(y, d, X), 0, arg.parity);
+      d[nu]--;
+
+      if(isConjugate) Oprod3 -= conj(Oprod3);
+      otherForce = U1*U2*conj(U3)*Oprod3*conj(U4);
+
+      // load Oprod(x+mu)
+      Matrix<Cmplx, 3> Oprod4;
+      d[mu]++;
+      arg.oprod.load((real*)(Oprod4.data), linkIndex(y, d, X), 0, arg.parity);
+      d[mu]--;
+
+      if(isConjugate) Oprod4 -= conj(Oprod4);
+
+      otherForce += U1*Oprod4*U2*conj(U3)*conj(U4);
+    }
 
 
-      // Lower leaf
-      // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x)
-      {
-        int d[4] = {0, 0, 0, 0};
-        // load U(x-nu)(+nu)
-        Matrix<Cmplx,3> U1;
-        d[nu]--;
-        loadLinkVariableFromArray(thisGauge, nu, linkIndex(y, d, X),
-            arg.gaugeStride, &U1);
-        d[nu]++;
-
-        // load U(x-nu)(+mu) 
-        Matrix<Cmplx, 3> U2;
-        d[nu]--;
-        loadLinkVariableFromArray(thisGauge, mu, linkIndex(y, d, X),
-            arg.gaugeStride, &U2);
-        d[nu]++;
-
-        // load U(x+mu-nu)(nu)
-        Matrix<Cmplx, 3> U3;
-        d[mu]++; d[nu]--;
-        loadLinkVariableFromArray(otherGauge, nu, linkIndex(y, d, X),
-            arg.gaugeStride, &U3);
-        d[mu]--; d[nu]++;
-
-        // load U(x)_(+mu)
-        Matrix<Cmplx,3> U4;
-        loadLinkVariableFromArray(otherGauge, mu, linkIndex(y, d, X),
-            arg.gaugeStride, &U4);
-
-        // load Oprod(x+mu)
-        Matrix<Cmplx, 3> Oprod1;
-        d[mu]++;
-        loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod1);
-        d[mu]--;    
-
-        if(isConjugate) Oprod1 -= conj(Oprod1);
-
-        otherForce -= conj(U1)*U2*U3*Oprod1*conj(U4);
-
-        Matrix<Cmplx,3> Oprod2;
-        d[nu]--;
-        loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod2);
-        d[nu]++;
-
-        if(isConjugate) Oprod2 -= conj(Oprod2);
-        otherForce -= conj(U1)*Oprod2*U2*U3*conj(U4);
-      }
+    // Lower leaf
+    // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x)
+    {
+      int d[4] = {0, 0, 0, 0};
+      // load U(x-nu)(+nu)
+      Matrix<Cmplx,3> U1;
+      d[nu]--;
+      arg.gauge.load((real*)(U1.data), linkIndex(y, d, X), nu, arg.parity);
+      d[nu]++;
+
+      // load U(x-nu)(+mu) 
+      Matrix<Cmplx, 3> U2;
+      d[nu]--;
+      arg.gauge.load((real*)(U2.data), linkIndex(y, d, X), mu, arg.parity);
+      d[nu]++;
+
+      // load U(x+mu-nu)(nu)
+      Matrix<Cmplx, 3> U3;
+      d[mu]++; d[nu]--;
+      arg.gauge.load((real*)(U3.data), linkIndex(y, d, X), nu, otherparity);
+      d[mu]--; d[nu]++;
+
+      // load U(x)_(+mu)
+      Matrix<Cmplx,3> U4;
+      arg.gauge.load((real*)(U4.data), linkIndex(y, d, X), mu, otherparity);
+
+      // load Oprod(x+mu)
+      Matrix<Cmplx, 3> Oprod1;
+      d[mu]++;
+      arg.oprod.load((real*)(Oprod1.data), linkIndex(y, d, X), 0, arg.parity);
+      d[mu]--;    
+
+      if(isConjugate) Oprod1 -= conj(Oprod1);
+
+      otherForce -= conj(U1)*U2*U3*Oprod1*conj(U4);
+
+      Matrix<Cmplx,3> Oprod2;
+      d[nu]--;
+      arg.oprod.load((real*)(Oprod2.data), linkIndex(y, d, X), 0, arg.parity);
+      d[nu]++;
+
+      if(isConjugate) Oprod2 -= conj(Oprod2);
+      otherForce -= conj(U1)*Oprod2*U2*U3*conj(U4);
+    }
 
-      {
-        int d[4] = {0, 0, 0, 0};
-        // load U(x-nu)(+nu)
-        Matrix<Cmplx,3> U1;
-        d[nu]--;
-        loadLinkVariableFromArray(otherGauge, nu, linkIndex(x, d, X), 
-            arg.gaugeStride, &U1);
-        d[nu]++;
-
-        // load U(x-nu)(+mu) 
-        Matrix<Cmplx, 3> U2;
-        d[nu]--;
-        loadLinkVariableFromArray(otherGauge, mu, linkIndex(x, d, X),
-            arg.gaugeStride, &U2);
-        d[nu]++;
-
-        // load U(x+mu-nu)(nu)
-        Matrix<Cmplx, 3> U3;
-        d[mu]++; d[nu]--;
-        loadLinkVariableFromArray(thisGauge, nu, linkIndex(x, d, X),
-            arg.gaugeStride, &U3);
-        d[mu]--; d[nu]++;
-
-        // load U(x)_(+mu)
-        Matrix<Cmplx,3> U4;
-        loadLinkVariableFromArray(thisGauge, mu, linkIndex(x, d, X),
-            arg.gaugeStride, &U4);
-
-
-        Matrix<Cmplx,3> Oprod1;
-        d[mu]++; d[nu]--;
-        loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod1);
-        d[mu]--; d[nu]++;
-
-        if(isConjugate) Oprod1 -= conj(Oprod1);
-        thisForce -= conj(U1)*U2*Oprod1*U3*conj(U4);
-
-        Matrix<Cmplx, 3> Oprod4;
-        loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod4);
-
-        if(isConjugate) Oprod4 -= conj(Oprod4);
-        thisForce -= Oprod4*conj(U1)*U2*U3*conj(U4);
-      }
+    {
+      int d[4] = {0, 0, 0, 0};
+      // load U(x-nu)(+nu)
+      Matrix<Cmplx,3> U1;
+      d[nu]--;
+      arg.gauge.load((real*)(U1.data), linkIndex(x, d, X), nu, otherparity);
+      d[nu]++;
+	
+      // load U(x-nu)(+mu) 
+      Matrix<Cmplx, 3> U2;
+      d[nu]--;
+      arg.gauge.load((real*)(U2.data), linkIndex(x, d, X), mu, otherparity);
+      d[nu]++;
+
+      // load U(x+mu-nu)(nu)
+      Matrix<Cmplx, 3> U3;
+      d[mu]++; d[nu]--;
+      arg.gauge.load((real*)(U3.data), linkIndex(x, d, X), nu, arg.parity);
+      d[mu]--; d[nu]++;
+
+      // load U(x)_(+mu)
+      Matrix<Cmplx,3> U4;
+      arg.gauge.load((real*)(U4.data), linkIndex(x, d, X), mu, arg.parity);
+
+      Matrix<Cmplx,3> Oprod1;
+      d[mu]++; d[nu]--;
+      arg.oprod.load((real*)(Oprod1.data), linkIndex(x, d, X), 0, arg.parity);
+      d[mu]--; d[nu]++;
+
+      if(isConjugate) Oprod1 -= conj(Oprod1);
+      thisForce -= conj(U1)*U2*Oprod1*U3*conj(U4);
+
+      Matrix<Cmplx, 3> Oprod4;
+      arg.oprod.load((real*)(Oprod4.data), linkIndex(x, d, X), 0, arg.parity);
+
+      if(isConjugate) Oprod4 -= conj(Oprod4);
+      thisForce -= Oprod4*conj(U1)*U2*U3*conj(U4);
+    }
     
-      thisForce *= arg.coeff;
-      otherForce *= arg.coeff;
+    thisForce *= arg.coeff;
+    otherForce *= arg.coeff;
 
 
-      // Write to array
-      {
-        appendMatrixToArray(thisForce, index, arg.forceStride, arg.force + arg.parity*arg.forceOffset);
-        appendMatrixToArray(otherForce, index, arg.forceStride, arg.force + otherparity*arg.forceOffset); 
-      }
-      return;
-    } // cloverDerivativeKernel
-
-
-  template<typename Complex>
+    // Write to array
+    {
+      Matrix<Cmplx, 3> F;
+      arg.force.load((real*)(F.data), index, 0, arg.parity);
+      F += thisForce;
+      arg.force.save((real*)(F.data), index, 0, arg.parity);
+    }
+      
+    {
+      Matrix<Cmplx, 3> F;
+      arg.force.load((real*)(F.data), index, 0, otherparity);
+      F += otherForce;
+      arg.force.save((real*)(F.data), index, 0, otherparity);
+    }
+      
+    return;
+  } // cloverDerivativeKernel
+  
+  
+  template<typename Complex, typename Arg>
   class CloverDerivative : public Tunable {
-
+    
   private:
-    CloverDerivArg<Complex> arg;
+    Arg arg;
     const GaugeField &meta;
 
     unsigned int sharedBytesPerThread() const { return 0; }
@@ -315,14 +293,15 @@ namespace quda {
     bool tuneGridDim() const { return false; }
 
   public:
-    CloverDerivative(const CloverDerivArg<Complex> &arg, const GaugeField &meta)
+    CloverDerivative(const Arg &arg, const GaugeField &meta)
       : arg(arg), meta(meta) {
-      writeAuxString("threads=%d,prec=%lu,stride=%d,geometery=%d",arg.volumeCB,sizeof(Complex)/2,arg.forceOffset);
+      writeAuxString("threads=%d,prec=%lu,fstride=%d,gstride=%d,ostride=%d",
+		     arg.volumeCB,sizeof(Complex)/2,arg.force.stride,arg.gauge.stride,arg.oprod.stride);
     }
     virtual ~CloverDerivative() {}
 
     void apply(const cudaStream_t &stream){
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      TuneParam tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity());
       if(arg.conjugate){
 	cloverDerivativeKernel<Complex,true><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
       }else{
@@ -330,65 +309,96 @@ namespace quda {
       }
     } // apply
 
-    void preTune(){}
-    void postTune(){}
+    void preTune() {} // FIXME
+    void postTune(){} // FIXME
 
-    long long flops() const {
-      return 0;
-    }
+    long long flops() const { return 0; }
 
     long long bytes() const { return 0; }
 
     TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
   };
 
-
-  // FIXME - the Tunable class isn't used here
+  
   template<typename Float>
-    void cloverDerivative(cudaGaugeField &out,
-        cudaGaugeField& gauge,
-        cudaGaugeField& oprod,
-        int mu, int nu, double coeff, int parity,
-        int conjugate)
-    {
-      typedef typename ComplexTypeId<Float>::Type Complex;
-      CloverDerivArg<Complex> arg(out, gauge, oprod, mu, nu, coeff, parity, conjugate);
-//      CloverDerivative<Complex> cloverDerivative(arg);
-//      cloverDerivative.apply(0);
-      dim3 blockDim(128, 1, 1);
-      dim3 gridDim((arg.volumeCB + blockDim.x-1)/blockDim.x, 1, 1);
-      if(conjugate){
-        cloverDerivativeKernel<Complex,true><<<gridDim,blockDim,0>>>(arg);
-      }else{
-        cloverDerivativeKernel<Complex,false><<<gridDim,blockDim,0>>>(arg);
+  void cloverDerivative(cudaGaugeField &out,
+			cudaGaugeField &gauge,
+			cudaGaugeField &oprod,
+			int mu, int nu, double coeff, int parity,
+			int conjugate) {
+ 
+    if (oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) 
+      errorQuda("Force field does not support reconstruction");
+    
+    if (out.Order() != oprod.Order()) 
+      errorQuda("Force and Oprod orders must match");
+    
+    if (out.Reconstruct() != QUDA_RECONSTRUCT_NO) 
+      errorQuda("Force field does not support reconstruction");
+  
+    typedef typename ComplexTypeId<Float>::Type Complex;
+
+    if (out.Order() == QUDA_FLOAT2_GAUGE_ORDER){
+      typedef FloatNOrder<Float, 18, 2, 18> F;
+      typedef FloatNOrder<Float, 18, 2, 18> O;
+
+      if(gauge.Order() == QUDA_FLOAT2_GAUGE_ORDER){
+	if(gauge.Reconstruct() == QUDA_RECONSTRUCT_NO){
+	  typedef FloatNOrder<Float, 18, 2, 18> G;
+	  typedef CloverDerivArg<Complex,F,G,O> Arg;
+	  Arg arg(F(out), G(gauge), O(oprod), out.X(), mu, nu, coeff, parity, conjugate);
+	  CloverDerivative<Complex, Arg> deriv(arg, gauge);
+	  deriv.apply(0);
+	}else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
+	  typedef FloatNOrder<Float, 18, 2, 12> G;
+	  typedef CloverDerivArg<Complex,F,G,O> Arg;
+	  Arg arg(F(out), G(gauge), O(oprod), out.X(), mu, nu, coeff, parity, conjugate);
+	  CloverDerivative<Complex, Arg> deriv(arg, gauge);
+	  deriv.apply(0);
+	}else{
+	  errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
+	}
+	
+      }else if(gauge.Order() == QUDA_FLOAT4_GAUGE_ORDER){
+	if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
+	  typedef FloatNOrder<Float, 18, 4, 12> G;
+	  typedef CloverDerivArg<Complex,F,G,O> Arg;
+	  Arg arg(F(out), G(gauge), O(oprod), out.X(), mu, nu, coeff, parity, conjugate);
+	  CloverDerivative<Complex, Arg> deriv(arg, gauge);
+	  deriv.apply(0);
+	}else{
+	  errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
+	}
       }
-    }    
-
-#endif
+    } else {
+      errorQuda("Force order %d not supported");
+    } // force / oprod order
+  }
+#endif // GPU_CLOVER
 
-  void cloverDerivative(cudaGaugeField &out,   
-      cudaGaugeField& gauge,
-      cudaGaugeField& oprod,
-      int mu, int nu, double coeff, QudaParity parity, int conjugate)
-  {
+void cloverDerivative(cudaGaugeField &out,   
+		      cudaGaugeField& gauge,
+		      cudaGaugeField& oprod,
+		      int mu, int nu, double coeff, QudaParity parity, int conjugate)
+{
 #ifdef GPU_CLOVER_DIRAC
-    assert(oprod.Geometry() == QUDA_SCALAR_GEOMETRY);
-    assert(out.Geometry() == QUDA_SCALAR_GEOMETRY);
+  assert(oprod.Geometry() == QUDA_SCALAR_GEOMETRY);
+  assert(out.Geometry() == QUDA_SCALAR_GEOMETRY);
 
-    int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1;
+  int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1;
 
-    if(out.Precision() == QUDA_DOUBLE_PRECISION){
-      cloverDerivative<double>(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate);   
-    } else if (out.Precision() == QUDA_SINGLE_PRECISION){
-      cloverDerivative<float>(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate);
-    } else {
-      errorQuda("Precision %d not supported", out.Precision());
-    }
-    return;
+  if(out.Precision() == QUDA_DOUBLE_PRECISION){
+    cloverDerivative<double>(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate);   
+  } else if (out.Precision() == QUDA_SINGLE_PRECISION){
+    cloverDerivative<float>(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate);
+  } else {
+    errorQuda("Precision %d not supported", out.Precision());
+  }
+  return;
 #else
-    errorQuda("Clover has not been built");
+  errorQuda("Clover has not been built");
 #endif
-  }              
+}              
 
 
 } // namespace quda

From aea5fbec645ccc55a6102bbb8ebc060b12f9b65b Mon Sep 17 00:00:00 2001
From: M Clark <mclark@nvidia.com>
Date: Tue, 26 May 2015 22:08:46 -0700
Subject: [PATCH 2/7] Added save/load routines to gauge::FloatNOrder necessary
 for saving and restoring state to enable autotuning for kernels that destroy
 input data.

---
 include/gauge_field_order.h | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 84c88d21ef..81c23d2faf 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -444,13 +444,15 @@ namespace quda {
 #if __COMPUTE_CAPABILITY__ >= 200
       const int hasPhase; 
       const size_t phaseOffset;
+      void *backup_h; //! host memory for backing up the field when tuning
+      size_t bytes;
 #endif
 
     FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : 
       reconstruct(u), volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry())
 #if __COMPUTE_CAPABILITY__ >= 200
 	, hasPhase((u.Reconstruct() == QUDA_RECONSTRUCT_9 || u.Reconstruct() == QUDA_RECONSTRUCT_13) ? 1 : 0), 
-	phaseOffset(u.PhaseOffset())
+	phaseOffset(u.PhaseOffset()), backup_h(0), bytes(u.Bytes())
 #endif
       {
 	if (gauge_) { gauge[0] = gauge_; gauge[1] = (Float*)((char*)gauge_ + u.Bytes()/2);
@@ -467,7 +469,7 @@ namespace quda {
     : reconstruct(order.reconstruct), volumeCB(order.volumeCB), stride(order.stride), 
 	geometry(order.geometry) 
 #if __COMPUTE_CAPABILITY__ >= 200
-	, hasPhase(order.hasPhase), phaseOffset(order.phaseOffset) 
+	, hasPhase(order.hasPhase), phaseOffset(order.phaseOffset), backup_h(0), bytes(order.bytes)
 #endif
       {
 	gauge[0] = order.gauge[0];
@@ -618,6 +620,30 @@ namespace quda {
 	}
       }
 
+      /**
+	 used to backup the field to the host when tuning
+      */
+      void save() {
+#if __COMPUTE_CAPABILITY__ >= 200
+	if (backup_h) errorQuda("Already allocated host backup");
+	backup_h = safe_malloc(bytes);
+	cudaMemcpy(backup_h, gauge[0], bytes, cudaMemcpyDeviceToHost);
+	checkCudaError();
+#endif
+      }
+      
+      /**
+	 restore the field from the host after tuning
+      */
+      void load() {
+#if __COMPUTE_CAPABILITY__ >= 200
+	cudaMemcpy(gauge[0], backup_h, bytes, cudaMemcpyHostToDevice);
+	host_free(backup_h);
+	backup_h = 0;
+	checkCudaError();
+#endif
+      }
+
       size_t Bytes() const { return reconLen * sizeof(Float); }
     };
 

From f9a9757bfc660c8f8162ad39ca01ab684603b3b6 Mon Sep 17 00:00:00 2001
From: M Clark <mclark@nvidia.com>
Date: Tue, 26 May 2015 22:12:30 -0700
Subject: [PATCH 3/7] Added tuning for clover derivative computation.

---
 lib/clover_deriv_quda.cu | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu
index f9a51c1b40..ba78011939 100644
--- a/lib/clover_deriv_quda.cu
+++ b/lib/clover_deriv_quda.cu
@@ -301,7 +301,11 @@ namespace quda {
     virtual ~CloverDerivative() {}
 
     void apply(const cudaStream_t &stream){
+#if __COMPUTE_CAPABILITY__ >= 200
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+#else // tuning not supported on Tesla architecture
       TuneParam tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity());
+#endif
       if(arg.conjugate){
 	cloverDerivativeKernel<Complex,true><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
       }else{
@@ -309,12 +313,14 @@ namespace quda {
       }
     } // apply
 
-    void preTune() {} // FIXME
-    void postTune(){} // FIXME
+    // The force field is updated so we must preserve its initial state
+#if __COMPUTE_CAPABILITY__ >= 200
+    void preTune() { arg.force.save(); } 
+    void postTune(){ arg.force.load(); } 
+#endif
 
     long long flops() const { return 0; }
-
-    long long bytes() const { return 0; }
+    long long bytes() const { return (16*arg.gauge.Bytes() + 8*arg.oprod.Bytes() + 4*arg.force.Bytes()) * arg.volumeCB; }
 
     TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
   };

From fc84df57302b052d96113d3db95377537a005ae0 Mon Sep 17 00:00:00 2001
From: M Clark <mclark@nvidia.com>
Date: Tue, 26 May 2015 22:18:02 -0700
Subject: [PATCH 4/7] Added autotuning support to clover trace computation.

---
 lib/clover_trace_quda.cu | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/lib/clover_trace_quda.cu b/lib/clover_trace_quda.cu
index e641d6295e..7e6331729f 100644
--- a/lib/clover_trace_quda.cu
+++ b/lib/clover_trace_quda.cu
@@ -219,9 +219,8 @@ namespace quda {
       void apply(const cudaStream_t &stream){
         if(location == QUDA_CUDA_FIELD_LOCATION){
 #if (__COMPUTE_CAPABILITY__ >= 200)
-          dim3 blockDim(128, 1, 1);
-          dim3 gridDim((arg.clover1.volumeCB + blockDim.x - 1)/blockDim.x, 1, 1);
-          cloverSigmaTraceKernel<Float,Clover1,Clover2,Gauge><<<gridDim,blockDim,0>>>(arg);
+	  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+          cloverSigmaTraceKernel<Float,Clover1,Clover2,Gauge><<<tp.grid,tp.block,0>>>(arg);
 #else
 	  errorQuda("cloverSigmaTrace not supported on pre-Fermi architecture");
 #endif
@@ -240,7 +239,7 @@ namespace quda {
       }
 
       long long flops() const { return 0; } // Fix this
-      long long bytes() const { return 0; } // Fix this
+      long long bytes() const { return (arg.clover1.Bytes() + arg.gauge.Bytes()) * arg.clover1.volumeCB; } 
 
     }; // CloverSigmaTrace
 
@@ -252,7 +251,6 @@ namespace quda {
     CloverTraceArg<Clover1, Clover2, Gauge> arg(clover1, clover2, gauge, dir1, dir2);
     CloverSigmaTrace<Float,Clover1,Clover2,Gauge> traceCompute(arg, meta, location);
     traceCompute.apply(0);
-    cudaDeviceSynchronize();
     return;
   }
 

From bd8a5ebe1ccf4d043cde7d850036895eef82297c Mon Sep 17 00:00:00 2001
From: M Clark <mclark@nvidia.com>
Date: Thu, 28 May 2015 00:40:37 -0700
Subject: [PATCH 5/7] Added qudaResidentExtendedGauge function to milc
 interface for creating an extended gauge field from the persistent gauge
 field.  Fixed inconsistent naming of sime QUDA interface functions and added
 some missing doxygen markup for these.

---
 include/quda.h                | 67 ++++++++++++++++++++++++++++-------
 include/quda_milc_interface.h |  2 ++
 lib/interface_quda.cpp        | 12 +++----
 lib/milc_interface.cpp        | 40 +++++++++++++++------
 4 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/include/quda.h b/include/quda.h
index e55faa4bfe..8c61f8c679 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -635,31 +635,74 @@ extern "C" {
 
 
   /**
-   * Take a gauge field on the host, extend it and load it onto the device. 
+   * Take a gauge field on the host, load it onto the device and extend it.
    * Return a pointer to the extended gauge field.
+   *
+   * @param gauge The CPU gauge field (optional - if set to 0 then the gauge field zeroed)
+   * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor)
+   * @param param The parameters of the external field and the field to be created
+   * @return Pointer to the gauge field (cast as a void*)
    */
-  void* createExtendedGaugeField(void* gauge, int geometry, QudaGaugeParam* param);
-
-  void* createGaugeField(void* gauge, int geometry, QudaGaugeParam* param);
+  void* createExtendedGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param);
 
-  void  saveGaugeField(void* outGauge, void* inGauge, QudaGaugeParam* param);
+  /**
+   * Allocate a gauge (matrix) field on the device and optionally download a host gauge field.
+   *
+   * @param gauge The host gauge field (optional - if set to 0 then the gauge field zeroed)
+   * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor)
+   * @param param The parameters of the external field and the field to be created
+   * @return Pointer to the gauge field (cast as a void*)
+   */
+  void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param);
 
-  void  extendGaugeField(void* outGauge, void* inGauge);
+  /**
+   * Store a gauge (matrix) field on the device and optionally download a CPU gauge field.
+   *
+   * @param outGauge Pointer to the host gauge field
+   * @param inGauge Pointer to the device gauge field
+   * @param param The parameters of the host and device fields
+   */
+  void  saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param);
 
+  /**
+   * Take a gauge field on the device and extend it
+   *
+   * @param outGauge Pointer to the output extended device gauge field
+   * @param inGauge Pointer to the input device gauge field
+   */
+  void  extendGaugeFieldQuda(void* outGauge, void* inGauge);
 
   /**
-   *  Reinterpret gauge as a pointer to cudaGaugeField and call destructor.
+   * Reinterpret gauge as a pointer to cudaGaugeField and call destructor.
+   *
+   * @param gauge Gauge field to be freed
    */
-  void destroyQudaGaugeField(void* gauge);
+  void destroyGaugeFieldQuda(void* gauge);
 
+  /**
+   * Compute the clover field and its inverse from the resident gauge field
+   *
+   * @param param The parameters of the clover field to create
+   */
   void createCloverQuda(QudaInvertParam* param);
 
+  /**
+   * Compute the sigma trace field (part of HMC)
+   * 
+   * @param out Sigma trace field 
+   * @param dummy (not used)
+   * @param mu mu direction
+   * @param nu nu direction
+   * @param dim array of local field dimensions
+   */
+  void computeCloverTraceQuda(void* out, void* dummy, int mu, int nu, int dim[4]);
 
-  void computeCloverTraceQuda(void* out, void* clover, int mu, int nu, int dim[4]);
-
+  /**
+   * Compute the derivative of the clover term
+   */
   void computeCloverDerivativeQuda(void* out, void* gauge, void* oprod, int mu, int nu,
-      double coeff,
-      QudaParity parity, QudaGaugeParam* param, int conjugate);
+				   double coeff,
+				   QudaParity parity, QudaGaugeParam* param, int conjugate);
 
   /**
    * Compute the quark-field outer product needed for gauge generation
diff --git a/include/quda_milc_interface.h b/include/quda_milc_interface.h
index 85984e0ede..59fe27e6f9 100644
--- a/include/quda_milc_interface.h
+++ b/include/quda_milc_interface.h
@@ -235,6 +235,8 @@ extern "C" {
 
   void* qudaCreateExtendedGaugeField(void* gauge, int geometry, int precision);
 
+  void* qudaResidentExtendedGaugeField(void* gauge, int geometry, int precision);
+
   void* qudaCreateGaugeField(void* gauge, int geometry, int precision);
 
   void qudaSaveGaugeField(void* gauge, void* inGauge);
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 6c1702cea4..27939a557f 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -677,7 +677,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
   if (!initialized) errorQuda("QUDA not initialized");
 
   if (!h_clover && !h_clovinv) {
-    printfQuda("clover_coeff: %lf\n", inv_param->clover_coeff);
     if(inv_param->clover_coeff != 0){
       device_calc = true;
     }else{
@@ -3788,7 +3787,6 @@ void createCloverQuda(QudaInvertParam* invertParam)
   profileCloverCreate.Start(QUDA_PROFILE_TOTAL);
   profileCloverCreate.Start(QUDA_PROFILE_INIT);
   if(!cloverPrecise){
-    printfQuda("About to create cloverPrecise\n");
     CloverFieldParam cloverParam;
     cloverParam.nDim = 4;
     for(int dir=0; dir<4; ++dir) cloverParam.x[dir] = gaugePrecise->X()[dir];
@@ -3898,7 +3896,7 @@ void createCloverQuda(QudaInvertParam* invertParam)
   return;
 }
 
-void* createGaugeField(void* gauge, int geometry, QudaGaugeParam* param)
+void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
 {
 
   GaugeFieldParam gParam(0,*param);
@@ -3928,7 +3926,7 @@ void* createGaugeField(void* gauge, int geometry, QudaGaugeParam* param)
 }
 
 
-void saveGaugeField(void* gauge, void* inGauge, QudaGaugeParam* param){
+void saveGaugeFieldQuda(void* gauge, void* inGauge, QudaGaugeParam* param){
 
   cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
 
@@ -3946,7 +3944,7 @@ void saveGaugeField(void* gauge, void* inGauge, QudaGaugeParam* param){
 }
 
 
-void* createExtendedGaugeField(void* gauge, int geometry, QudaGaugeParam* param)
+void* createExtendedGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
 {
   profileExtendedGauge.Start(QUDA_PROFILE_TOTAL);
 
@@ -4028,7 +4026,7 @@ void* createExtendedGaugeField(void* gauge, int geometry, QudaGaugeParam* param)
 }
 
 // extend field on the GPU
-void extendGaugeField(void* out, void* in){
+void extendGaugeFieldQuda(void* out, void* in){
   cudaGaugeField* inGauge   = reinterpret_cast<cudaGaugeField*>(in);
   cudaGaugeField* outGauge  = reinterpret_cast<cudaGaugeField*>(out);
 
@@ -4042,7 +4040,7 @@ void extendGaugeField(void* out, void* in){
 
 
 
-void destroyQudaGaugeField(void* gauge){
+void destroyGaugeFieldQuda(void* gauge){
   cudaGaugeField* g = reinterpret_cast<cudaGaugeField*>(gauge);
   delete g;
 }
diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp
index c815ebbbed..e595d5371a 100644
--- a/lib/milc_interface.cpp
+++ b/lib/milc_interface.cpp
@@ -878,14 +878,25 @@ void qudaInvert(int external_precision,
 
 #ifdef GPU_CLOVER_DIRAC
 
-void* qudaCreateExtendedGaugeField(void* gauge, int geometry, int precision)
+static inline void* createExtendedGaugeField(void* gauge, int geometry, int precision, int resident)
 {
   qudamilc_called<true>(__func__);
   QudaPrecision qudaPrecision = (precision==2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; 
   QudaGaugeParam gaugeParam = newMILCGaugeParam(localDim, qudaPrecision,
       (geometry==1) ? QUDA_GENERAL_LINKS : QUDA_SU3_LINKS);
+  gaugeParam.use_resident_gauge = resident ? 1 : 0;
+
+  return createExtendedGaugeFieldQuda(gauge, geometry, &gaugeParam);
+}
+
+void* qudaCreateExtendedGaugeField(void* gauge, int geometry, int precision)
+{
+  return createExtendedGaugeField(gauge, geometry, precision, 0);
+}
 
-  return createExtendedGaugeField(gauge, geometry, &gaugeParam);
+void* qudaResidentExtendedGaugeField(void* gauge, int geometry, int precision)
+{
+  return createExtendedGaugeField(gauge, geometry, precision, 1);
 }
 
 
@@ -896,7 +907,7 @@ void* qudaCreateGaugeField(void* gauge, int geometry, int precision)
   QudaGaugeParam gaugeParam = newMILCGaugeParam(localDim, qudaPrecision,
       (geometry==1) ? QUDA_GENERAL_LINKS : QUDA_SU3_LINKS);
 
-  return createGaugeField(gauge, geometry, &gaugeParam);
+  return createGaugeFieldQuda(gauge, geometry, &gaugeParam);
 }
 
 
@@ -905,7 +916,7 @@ void qudaSaveGaugeField(void* gauge, void* inGauge)
   qudamilc_called<true>(__func__);
   cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
   QudaGaugeParam gaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS);
-  saveGaugeField(gauge, inGauge, &gaugeParam);
+  saveGaugeFieldQuda(gauge, inGauge, &gaugeParam);
   qudamilc_called<false>(__func__);
   return;
 }
@@ -914,7 +925,7 @@ void qudaSaveGaugeField(void* gauge, void* inGauge)
 void qudaDestroyGaugeField(void* gauge)
 {
   qudamilc_called<true>(__func__);
-  destroyQudaGaugeField(gauge);
+  destroyGaugeFieldQuda(gauge);
     qudamilc_called<false>(__func__);
   return;
 }
@@ -994,7 +1005,7 @@ void setGaugeParams(QudaGaugeParam &gaugeParam, const int dim[4], QudaInvertArgs
 
 
 void setInvertParam(QudaInvertParam &invertParam, QudaInvertArgs_t &inv_args, 
-    int external_precision, int quda_precision, double kappa) {
+		    int external_precision, int quda_precision, double kappa, double reliable_delta) {
 
   const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
   const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION;
@@ -1014,7 +1025,7 @@ void setInvertParam(QudaInvertParam &invertParam, QudaInvertArgs_t &inv_args,
   invertParam.dagger                        = QUDA_DAG_NO;
   invertParam.mass_normalization            = QUDA_KAPPA_NORMALIZATION;
   invertParam.gcrNkrylov                    = 30; 
-  invertParam.reliable_delta                = 1e-1; 
+  invertParam.reliable_delta                = reliable_delta; 
   invertParam.maxiter                       = inv_args.max_iter;
 
   invertParam.cuda_prec_precondition        = device_precision_sloppy;
@@ -1069,7 +1080,7 @@ void qudaLoadCloverField(int external_precision,
     double *trlog) {
 
   QudaInvertParam invertParam = newQudaInvertParam();
-  setInvertParam(invertParam, inv_args, external_precision, quda_precision, 0.0);
+  setInvertParam(invertParam, inv_args, external_precision, quda_precision, 0.0, 0.0);
   invertParam.solution_type = solution_type;
   invertParam.solve_type = solve_type;
   invertParam.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC;
@@ -1132,8 +1143,10 @@ void qudaCloverInvert(int external_precision,
   qudaLoadCloverField(external_precision, quda_precision, inv_args, clover, cloverInverse,
       QUDA_MAT_SOLUTION, QUDA_DIRECT_PC_SOLVE, clover_coeff, 0, 0);
 
+  double reliable_delta = 1e-1;
+
   QudaInvertParam invertParam = newQudaInvertParam();
-  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa);
+  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta);
   invertParam.residual_type = static_cast<QudaResidualType_s>(0);
   invertParam.residual_type = (target_residual != 0) ? static_cast<QudaResidualType_s> ( invertParam.residual_type | QUDA_L2_RELATIVE_RESIDUAL) : invertParam.residual_type;
   invertParam.residual_type = (target_fermilab_residual != 0) ? static_cast<QudaResidualType_s> (invertParam.residual_type | QUDA_HEAVY_QUARK_RESIDUAL) : invertParam.residual_type;
@@ -1186,8 +1199,10 @@ void qudaCloverMultishiftInvert(int external_precision,
     }
   }
 
+  // if doing a pure double-precision multi-shift solve don't use reliable updates
+  double reliable_delta = 1e-1;
   QudaInvertParam invertParam = newQudaInvertParam();
-  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa);
+  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta);
   invertParam.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
   invertParam.num_offset = num_offsets;
   for(int i=0; i<num_offsets; ++i){
@@ -1248,8 +1263,11 @@ void qudaCloverMultishiftMDInvert(int external_precision,
     }
   }
 
+  // if doing a pure double-precision multi-shift solve don't use reliable updates
+  double reliable_delta = (inv_args.mixed_precision == 1 || quda_precision == 1) ? 1e-1 : 0.0;
+
   QudaInvertParam invertParam = newQudaInvertParam();
-  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa);
+  setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta);
   invertParam.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
   invertParam.num_offset = num_offsets;
   for(int i=0; i<num_offsets; ++i){

From c53183230fe04feaedef6416759b648ef9bf7ff9 Mon Sep 17 00:00:00 2001
From: M Clark <mclark@nvidia.com>
Date: Thu, 28 May 2015 00:41:43 -0700
Subject: [PATCH 6/7] When using the milc clover multi-shift solver interface,
 do not use reliable updates if doing a pure double precision solver.

---
 lib/milc_interface.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp
index e595d5371a..ad89e51c2c 100644
--- a/lib/milc_interface.cpp
+++ b/lib/milc_interface.cpp
@@ -1200,7 +1200,7 @@ void qudaCloverMultishiftInvert(int external_precision,
   }
 
   // if doing a pure double-precision multi-shift solve don't use reliable updates
-  double reliable_delta = 1e-1;
+  double reliable_delta = (inv_args.mixed_precision == 1 || quda_precision == 1) ? 1e-1 : 0.0;
   QudaInvertParam invertParam = newQudaInvertParam();
   setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta);
   invertParam.residual_type = QUDA_L2_RELATIVE_RESIDUAL;

From e338930866c922485a0f736d8574d50adb9d5935 Mon Sep 17 00:00:00 2001
From: M Clark <mclark@nvidia.com>
Date: Thu, 28 May 2015 10:50:24 -0700
Subject: [PATCH 7/7] Added doxygen comments for some milc interface functions
 and augmented documentation in quda.h

---
 include/quda.h                |  37 +++++++---
 include/quda_milc_interface.h | 129 ++++++++++++++++++++++++++++------
 2 files changed, 136 insertions(+), 30 deletions(-)

diff --git a/include/quda.h b/include/quda.h
index 8c61f8c679..d3b04ae250 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -636,7 +636,7 @@ extern "C" {
 
   /**
    * Take a gauge field on the host, load it onto the device and extend it.
-   * Return a pointer to the extended gauge field.
+   * Return a pointer to the extended gauge field object.
    *
    * @param gauge The CPU gauge field (optional - if set to 0 then the gauge field zeroed)
    * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor)
@@ -656,19 +656,21 @@ extern "C" {
   void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param);
 
   /**
-   * Store a gauge (matrix) field on the device and optionally download a CPU gauge field.
+   * Copy the QUDA gauge (matrix) field on the device to the CPU
    *
    * @param outGauge Pointer to the host gauge field
-   * @param inGauge Pointer to the device gauge field
+   * @param inGauge Pointer to the device gauge field (QUDA device field)
    * @param param The parameters of the host and device fields
    */
   void  saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param);
 
   /**
-   * Take a gauge field on the device and extend it
+   * Take a gauge field on the device and copy to the extended gauge
+   * field.  The precisions and reconstruct types can differ between
+   * the input and output field, but they must be compatible (same volume, geometry).
    *
-   * @param outGauge Pointer to the output extended device gauge field
-   * @param inGauge Pointer to the input device gauge field
+   * @param outGauge Pointer to the output extended device gauge field (QUDA extended device field)
+   * @param inGauge Pointer to the input device gauge field (QUDA gauge field)
    */
   void  extendGaugeFieldQuda(void* outGauge, void* inGauge);
 
@@ -680,16 +682,19 @@ extern "C" {
   void destroyGaugeFieldQuda(void* gauge);
 
   /**
-   * Compute the clover field and its inverse from the resident gauge field
+   * Compute the clover field and its inverse from the resident gauge field.
    *
    * @param param The parameters of the clover field to create
    */
   void createCloverQuda(QudaInvertParam* param);
 
   /**
-   * Compute the sigma trace field (part of HMC)
+   * Compute the sigma trace field (part of clover force computation).
+   * All the pointers here are for QUDA native device objects.  The
+   * precisions of all fields must match.  This function requires that
+   * there is a persistent clover field.
    * 
-   * @param out Sigma trace field 
+   * @param out Sigma trace field  (QUDA device field, geometry = 1)
    * @param dummy (not used)
    * @param mu mu direction
    * @param nu nu direction
@@ -698,7 +703,19 @@ extern "C" {
   void computeCloverTraceQuda(void* out, void* dummy, int mu, int nu, int dim[4]);
 
   /**
-   * Compute the derivative of the clover term
+   * Compute the derivative of the clover term (part of clover force
+   * computation).  All the pointers here are for QUDA native device
+   * objects.  The precisions of all fields must match.
+   * 
+   * @param out Clover derivative field (QUDA device field, geometry = 1)
+   * @param gauge Gauge field (extended QUDA device field, gemoetry = 4)
+   * @param oprod Matrix field (outer product) which is multiplied by the derivative
+   * @param mu mu direction
+   * @param nu nu direction
+   * @param coeff Coefficient of the clover derviative (including stepsize and clover coefficient)
+   * @param parity Parity for which we are computing
+   * @param param Gauge field meta data
+   * @param conjugate Whether to make the oprod field anti-hermitian prior to multiplication
    */
   void computeCloverDerivativeQuda(void* out, void* gauge, void* oprod, int mu, int nu,
 				   double coeff,
diff --git a/include/quda_milc_interface.h b/include/quda_milc_interface.h
index 59fe27e6f9..89294bace1 100644
--- a/include/quda_milc_interface.h
+++ b/include/quda_milc_interface.h
@@ -221,29 +221,118 @@ extern "C" {
       void* oprod[2]);
 
 
+  /**
+   * Evolve the gauge field by step size dt, using the momentum field
+   * I.e., Evalulate U(t+dt) = e(dt pi) U(t).  All fields are CPU fields in MILC order.
+   *
+   * @param precision Precision of the field (2 - double, 1 - single)
+   * @param dt The integration step size step
+   * @param momentum The momentum field
+   * @param The gauge field to be updated 
+   */
   void qudaUpdateU(int precision, 
-      double eps,
-      void* momentum, 
-      void* link);
-
-  void qudaCloverTrace(void* out, void* clover, int mu, int nu);
-
-
-  void qudaCloverDerivative(void* out, void* gauge, void* oprod, 
-      int mu, int nu, double coeff, int precision, int parity, int conjugate);
-
-
-  void* qudaCreateExtendedGaugeField(void* gauge, int geometry, int precision);
-
-  void* qudaResidentExtendedGaugeField(void* gauge, int geometry, int precision);
-
-  void* qudaCreateGaugeField(void* gauge, int geometry, int precision);
-
-  void qudaSaveGaugeField(void* gauge, void* inGauge);
-
+		   double eps,
+		   void* momentum, 
+		   void* link);
+  
+  /**
+   * Compute the sigma trace field (part of clover force computation).
+   * All the pointers here are for QUDA native device objects.  The
+   * precisions of all fields must match.  This function requires that
+   * there is a persistent clover field.
+   * 
+   * @param out Sigma trace field  (QUDA device field, geometry = 1)
+   * @param dummy (not used)
+   * @param mu mu direction
+   * @param nu nu direction
+   */
+  void qudaCloverTrace(void* out,
+		       void* dummy,
+		       int mu,
+		       int nu);
+
+
+  /**
+   * Compute the derivative of the clover term (part of clover force
+   * computation).  All the pointers here are for QUDA native device
+   * objects.  The precisions of all fields must match.
+   * 
+   * @param out Clover derivative field (QUDA device field, geometry = 1)
+   * @param gauge Gauge field (extended QUDA device field, gemoetry = 4)
+   * @param oprod Matrix field (outer product) which is multiplied by the derivative
+   * @param mu mu direction
+   * @param nu nu direction
+   * @param coeff Coefficient of the clover derviative (including stepsize and clover coefficient)
+   * @param precision Precision of the fields (2 = double, 1 = single)
+   * @param parity Parity for which we are computing
+   * @param conjugate Whether to make the oprod field anti-hermitian prior to multiplication
+   */
+  void qudaCloverDerivative(void* out,
+			    void* gauge,
+			    void* oprod, 
+			    int mu,
+			    int nu,
+			    double coeff,
+			    int precision,
+			    int parity,
+			    int conjugate);
+
+
+  /**
+   * Take a gauge field on the host, load it onto the device and extend it.
+   * Return a pointer to the extended gauge field object.
+   *
+   * @param gauge The CPU gauge field (optional - if set to 0 then the gauge field zeroed)
+   * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor)
+   * @param precision The precision of the fields (2 - double, 1 - single)
+   * @return Pointer to the gauge field (cast as a void*)
+   */
+  void* qudaCreateExtendedGaugeField(void* gauge,
+				     int geometry,
+				     int precision);
+
+  /**
+   * Take the QUDA resident gauge field and extend it.
+   * Return a pointer to the extended gauge field object.
+   *
+   * @param gauge The CPU gauge field (optional - if set to 0 then the gauge field zeroed)
+   * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor)
+   * @param precision The precision of the fields (2 - double, 1 - single)
+   * @return Pointer to the gauge field (cast as a void*)
+   */
+  void* qudaResidentExtendedGaugeField(void* gauge,
+				       int geometry,
+				       int precision);
+
+  /**
+   * Allocate a gauge (matrix) field on the device and optionally download a host gauge field.
+   *
+   * @param gauge The host gauge field (optional - if set to 0 then the gauge field zeroed)
+   * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor)
+   * @param precision The precision of the field to be created (2 - double, 1 - single)
+   * @return Pointer to the gauge field (cast as a void*)
+   */
+  void* qudaCreateGaugeField(void* gauge,
+			     int geometry,
+			     int precision);
+
+  /**
+   * Copy the QUDA gauge (matrix) field on the device to the CPU
+   *
+   * @param outGauge Pointer to the host gauge field
+   * @param inGauge Pointer to the device gauge field (QUDA device field)
+   */
+  void qudaSaveGaugeField(void* gauge,
+			  void* inGauge);
+
+  /**
+   * Reinterpret gauge as a pointer to cudaGaugeField and call destructor.
+   *
+   * @param gauge Gauge field to be freed
+   */
   void qudaDestroyGaugeField(void* gauge);
 
-
+  
 #ifdef __cplusplus
 }
 #endif