diff --git a/include/clover_field.h b/include/clover_field.h
index 6f16c99154..1c1f2067f4 100644
--- a/include/clover_field.h
+++ b/include/clover_field.h
@@ -25,6 +25,16 @@ namespace quda {
       order = (precision == QUDA_DOUBLE_PRECISION) ? 
 	QUDA_FLOAT2_CLOVER_ORDER : QUDA_FLOAT4_CLOVER_ORDER;
     }
+    // Default constructor: direct and inverse both enabled, all data pointers null, untwisted (mu2 = 0)
+    CloverFieldParam() :  LatticeFieldParam(),
+      direct(true), inverse(true), clover(nullptr), norm(nullptr),
+      cloverInv(nullptr), invNorm(nullptr), twisted(false), mu2(0.0) { }
+    // Copy constructor. NOTE(review): 'order' (assigned above) is not listed here - confirm it is copied elsewhere
+    CloverFieldParam(const CloverFieldParam &param) :  LatticeFieldParam(param),
+      direct(param.direct), inverse(param.inverse),
+      clover(param.clover), norm(param.norm),
+      cloverInv(param.cloverInv), invNorm(param.invNorm),
+      twisted(param.twisted), mu2(param.mu2) { }
   };
 
   std::ostream& operator<<(std::ostream& output, const CloverFieldParam& param);
diff --git a/include/clover_field_order.h b/include/clover_field_order.h
index 73a3112398..ceb08ad5c2 100644
--- a/include/clover_field_order.h
+++ b/include/clover_field_order.h
@@ -144,7 +144,7 @@ namespace quda {
           complex<Float> tmp(a[parity][idx], a[parity][idx+1]);
           return tmp;
 	} else {
-	  // requesting upper triangular so return conjuate transpose
+	  // requesting upper triangular so return conjugate transpose
 	  return conj(operator()(parity,x,s_col,s_row,c_col,c_row) );
 	}
       }
@@ -252,65 +252,143 @@ namespace quda {
     template <typename Float, int length, int N>
       struct FloatNOrder {
 	typedef typename mapper<Float>::type RegType;
-	Float *clover[2];
-	float *norm[2];
+	typedef typename VectorType<Float,N>::type Vector;
+	static const int M=length/(N*2); // number of short vectors per chiral block
+	static const int block=length/2; // chiral block size
+
+	Float *clover;
+	float *norm;
+	size_t offset;
+	size_t norm_offset;
 	const int volumeCB;
 	const int stride;
 
 	const bool twisted;
 	const Float mu2;
-	
-      FloatNOrder(const CloverField &clover, bool inverse, Float *clover_=0, float *norm_=0) : volumeCB(clover.VolumeCB()), stride(clover.Stride()),
-	  twisted(clover.Twisted()), mu2(clover.Mu2()) {
-	this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse));
-	this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2);
-	this->norm[0] = norm_ ? norm_ : (float*)(clover.Norm(inverse));
-	this->norm[1] = (float*)((char*)this->norm[0] + clover.NormBytes()/2);
-      }
+
+	size_t bytes;
+	size_t norm_bytes;
+	void *backup_h; //! host memory for backing up the field when tuning
+	void *backup_norm_h; //! host memory for backing up norm when tuning
+
+        FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, float *norm_=0) :
+	  offset(clover.Bytes()/(2*sizeof(Float))), norm_offset(clover.NormBytes()/(2*sizeof(float))),
+	  volumeCB(clover.VolumeCB()), stride(clover.Stride()),
+	  twisted(clover.Twisted()), mu2(clover.Mu2()), bytes(clover.Bytes()),
+	  norm_bytes(clover.NormBytes()), backup_h(nullptr), backup_norm_h(nullptr)
+	{
+	  this->clover = clover_ ? clover_ : (Float*)(clover.V(is_inverse));
+	  this->norm = norm_ ? norm_ : (float*)(clover.Norm(is_inverse));
+	}
       
 	bool  Twisted()	const	{return twisted;}
 	Float Mu2()	const	{return mu2;}
 	
-	__device__ __host__ inline void load(RegType v[length], int x, int parity) const {
-	  const int M=length/(N*2);
-	  for (int chirality=0; chirality<2; chirality++) {
-	    for (int i=0; i<M; i++) {
-	      for (int j=0; j<N; j++) {
-		int intIdx = (chirality*M + i)*N + j; // internal dof index
-		int padIdx = intIdx / N;
-		copy(v[(chirality*M+i)*N+j], clover[parity][(padIdx*stride + x)*N + intIdx%N]);
-		if (sizeof(Float)==sizeof(short)) v[(chirality*M+i)*N+j] *= norm[parity][chirality*stride + x];
-	      }
-	    }
+	/**
+	   @brief Load accessor for a single chiral block
+	   @param[out] v Vector of loaded elements
+	   @param[in] x Checkerboarded site index
+	   @param[in] parity Field parity
+	   @param[in] chirality Chiral block index
+	 */
+	__device__ __host__ inline void load(RegType v[block], int x, int parity, int chirality) const {
+#pragma unroll
+	  for (int i=0; i<M; i++) {
+	    // first do vectorized copy from memory
+	    Vector vecTmp = vector_load<Vector>(clover + parity*offset, x + stride*(chirality*M+i));
+	    // second do scalar copy converting into register type
+#pragma unroll
+	    for (int j=0; j<N; j++) copy(v[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
 	  }
+
+	  if (sizeof(Float)==sizeof(short))
+#pragma unroll
+	    for (int i=0; i<block; i++) v[i] *= norm[parity*norm_offset + chirality*stride + x];
 	}
   
-	__device__ __host__ inline void save(const RegType v[length], int x, int parity) {
+	/**
+	   @brief Store accessor for a single chiral block
+	   @param[in] v Vector of elements to be stored
+	   @param[in] x Checkerboarded site index
+	   @param[in] parity Field parity
+	   @param[in] chirality Chiral block index
+	 */
+	__device__ __host__ inline void save(const RegType v[block], int x, int parity, int chirality) {
 	  // find the norm of each chiral block
-	  RegType scale[2];
+	  RegType scale = 0.0;
 	  if (sizeof(Float)==sizeof(short)) {
-	    const int M = length/2;
-	    for (int chi=0; chi<2; chi++) { // chirality
-	      scale[chi] = 0.0;
-	      for (int i=0; i<M; i++) 
-		scale[chi] = fabs(v[chi*M+i]) > scale[chi] ? fabs(v[chi*M+i]) : scale[chi];
-	      norm[parity][chi*stride + x] = scale[chi];
-	    }
+#pragma unroll
+	    for (int i=0; i<block; i++) scale = fabs(v[i]) > scale ? fabs(v[i]) : scale;
+	    norm[parity*norm_offset + chirality*stride + x] = scale;
 	  }
 
-	  const int M=length/(N*2);
-	  for (int chirality=0; chirality<2; chirality++) {
-	    for (int i=0; i<M; i++) {
-	      for (int j=0; j<N; j++) {
-		int intIdx = (chirality*M + i)*N + j;
-		int padIdx = intIdx / N;
-		if (sizeof(Float)==sizeof(short))
-		  copy(clover[parity][(padIdx*stride + x)*N + intIdx%N], v[(chirality*M+i)*N+j] / scale[chirality]);
-		else
-		  copy(clover[parity][(padIdx*stride + x)*N + intIdx%N], v[(chirality*M+i)*N+j]);
-	      }
-	    }
+#pragma unroll
+	  for (int i=0; i<M; i++) {
+	    Vector vecTmp;
+	    // first do scalar copy converting into storage type and rescaling if necessary
+	    if (sizeof(Float)==sizeof(short))
+#pragma unroll
+	      for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[i*N+j] / scale);
+	    else
+#pragma unroll
+	      for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[i*N+j]);
+
+	    // second do vectorized copy into memory
+	    reinterpret_cast<Vector*>(clover + parity*offset)[x + stride*(chirality*M+i)] = vecTmp;
+	  }
+	}
+
+	/**
+	   @brief Load accessor for the full clover matrix (both chiral blocks)
+	   @param[out] v Vector of loaded elements; chiral block c starts at v[c*block]
+	   @param[in] x Checkerboarded site index
+	   @param[in] parity Field parity
+	   @note Calls the single-chiral-block load once for chirality 0 and once for chirality 1
+	 */
+	__device__ __host__ inline void load(RegType v[length], int x, int parity) const {
+#pragma unroll
+	  for (int chirality=0; chirality<2; chirality++) load(&v[chirality*block], x, parity, chirality);
+	}
+
+	/**
+	   @brief Store accessor for the full clover matrix (both chiral blocks)
+	   @param[in] v Vector of elements to be stored; chiral block c starts at v[c*block]
+	   @param[in] x Checkerboarded site index
+	   @param[in] parity Field parity
+	   @note Calls the single-chiral-block save once for chirality 0 and once for chirality 1
+	 */
+	__device__ __host__ inline void save(const RegType v[length], int x, int parity) {
+#pragma unroll
+	  for (int chirality=0; chirality<2; chirality++) save(&v[chirality*block], x, parity, chirality);
+	}
+
+	/**
+	   @brief Backup the field to the host when tuning
+	*/
+	void save() {
+	  if (backup_h) errorQuda("Already allocated host backup");
+	  backup_h = safe_malloc(bytes);
+	  cudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost);
+	  if (norm_bytes) {
+	    backup_norm_h = safe_malloc(norm_bytes);
+	    cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
+	  }
+	  checkCudaError();
+	}
+
+	/**
+	   @brief Restore the field from the host after tuning
+	*/
+	void load() {
+	  cudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice);
+	  host_free(backup_h);
+	  backup_h = nullptr;
+	  if (norm_bytes) {
+	    cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice);
+	    host_free(backup_norm_h);
+	    backup_norm_h = nullptr;
 	  }
+	  checkCudaError();
 	}
 
 	size_t Bytes() const {
@@ -320,6 +398,7 @@ namespace quda {
 	}
       };
 
+
     /**
        QDP ordering for clover fields
     */
diff --git a/include/dirac_quda.h b/include/dirac_quda.h
index c62502b8ae..9488f4ca96 100644
--- a/include/dirac_quda.h
+++ b/include/dirac_quda.h
@@ -33,7 +33,6 @@ namespace quda {
     cudaGaugeField *fatGauge;  // used by staggered only
     cudaGaugeField *longGauge; // used by staggered only
     cudaCloverField *clover;
-    cudaCloverField *cloverInv;
   
     double mu; // used by twisted mass only
     double epsilon; //2nd tm parameter (used by twisted mass only)
@@ -49,7 +48,7 @@ namespace quda {
 
   DiracParam() 
     : type(QUDA_INVALID_DIRAC), kappa(0.0), m5(0.0), matpcType(QUDA_MATPC_INVALID),
-      dagger(QUDA_DAG_INVALID), gauge(0), clover(0), cloverInv(0), mu(0.0), epsilon(0.0),
+      dagger(QUDA_DAG_INVALID), gauge(0), clover(0), mu(0.0), epsilon(0.0),
       tmp1(0), tmp2(0)
     {
 
@@ -537,7 +536,6 @@ namespace quda {
     double mu;
     double epsilon;
     cudaCloverField &clover;
-    cudaCloverField &cloverInv;
     void checkParitySpinor(const ColorSpinorField &, const ColorSpinorField &) const;
     void twistedCloverApply(ColorSpinorField &out, const ColorSpinorField &in, 
           const QudaTwistGamma5Type twistType, const int parity) const;
diff --git a/include/multigrid.h b/include/multigrid.h
index 682442e6e8..a8e1eb95d6 100644
--- a/include/multigrid.h
+++ b/include/multigrid.h
@@ -329,7 +329,6 @@ namespace quda {
      @param T[in] Transfer operator that defines the coarse space
      @param gauge[in] Gauge field from fine grid
      @param clover[in] Clover field on fine grid (optional)
-     @param cloverInv[in] Inverse Clover field on fine grid (optional, only for twisted-clover)
      @param kappa[in] Kappa parameter
      @param mu[in] Mu parameter (set to non-zero for twisted-mass/twisted-clover)
      @param matpc[in] The type of even-odd preconditioned fine-grid
@@ -338,7 +337,7 @@ namespace quda {
      even-odd preconditioned and we coarsen the full operator.
    */
   void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T,
-		const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv,
+		const cudaGaugeField &gauge, const cudaCloverField *clover,
 		double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc);
 
   /**
diff --git a/include/quda.h b/include/quda.h
index 7b22576d35..1b38429d14 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -183,6 +183,11 @@ extern "C" {
     int compute_clover_trlog;              /**< Whether to compute the trace log of the clover term */
     double trlogA[2];                      /**< The trace log of the clover term (even/odd computed separately) */
 
+    int compute_clover;                    /**< Whether to compute the clover field */
+    int compute_clover_inverse;            /**< Whether to compute the clover inverse field */
+    int return_clover;                     /**< Whether to copy back the clover matrix field */
+    int return_clover_inverse;             /**< Whether to copy back the inverted clover matrix field */
+
     QudaVerbosity verbosity;               /**< The verbosity setting to use in the solver */
 
     int sp_pad;                            /**< The padding to use for the fermion fields */
diff --git a/lib/check_params.h b/lib/check_params.h
index a10b3d63cb..d04220a5a2 100644
--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -355,9 +355,18 @@ void printQudaInvertParam(QudaInvertParam *param) {
 #if defined INIT_PARAM
     P(clover_cuda_prec_precondition, QUDA_INVALID_PRECISION);
     P(compute_clover_trlog, 0);
+    P(compute_clover, 0);
+    P(compute_clover_inverse, 0);
+    P(return_clover, 0);
+    P(return_clover_inverse, 0);
 #else
-  if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION)
-    param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy;
+    if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION)
+      param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy;
+    P(compute_clover_trlog, INVALID_INT);
+    P(compute_clover, INVALID_INT);
+    P(compute_clover_inverse, INVALID_INT);
+    P(return_clover, INVALID_INT);
+    P(return_clover_inverse, INVALID_INT);
 #endif
     P(clover_order, QUDA_INVALID_CLOVER_ORDER);
     P(cl_pad, INVALID_INT);
diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu
index 2764a84e2f..0142c493af 100644
--- a/lib/clover_invert.cu
+++ b/lib/clover_invert.cu
@@ -4,7 +4,7 @@
 #include <launch_kernel.cuh>
 #include <face_quda.h>
 #include <atomic.cuh>
-#include <cub/cub.cuh> 
+#include <cub_helper.cuh>
 
 namespace quda {
 
@@ -13,35 +13,31 @@ namespace quda {
 #ifdef GPU_CLOVER_DIRAC
 
   template <typename Clover>
-  struct CloverInvertArg {
+  struct CloverInvertArg : public ReduceArg<double2> {
     const Clover clover;
     Clover inverse;
     bool computeTraceLog;
-    double * const trlogA_h;
-    double *trlogA_d;
 //extra attributes for twisted mass clover
     bool twist;
     double mu2;
-    CloverInvertArg(Clover &inverse, const Clover &clover, bool computeTraceLog=0, double* const trlogA=0) :
-      inverse(inverse), clover(clover), computeTraceLog(computeTraceLog), trlogA_h(trlogA), twist(clover.Twisted()), mu2(clover.Mu2()){
-      cudaHostGetDevicePointer(&trlogA_d, trlogA_h, 0); // set the matching device pointer
-    }
+    CloverInvertArg(Clover &inverse, const Clover &clover, bool computeTraceLog=0) :
+      ReduceArg<double2>(), inverse(inverse), clover(clover), computeTraceLog(computeTraceLog),
+      twist(clover.Twisted()), mu2(clover.Mu2()) { }
   };
 
   /**
      Use a Cholesky decomposition to invert the clover matrix
      Here we use an inplace inversion which hopefully reduces register pressure
    */
-  template <int blockSize, typename Float, typename Clover>
-  __device__ __host__ double cloverInvertCompute(CloverInvertArg<Clover> arg, int x, int parity) {
-
-    Float A[72];
-    double trlogA = 0.0; 
+  template <int blockSize, typename Float, typename Clover, bool computeTrLog, bool twist>
+  __device__ __host__ inline double cloverInvertCompute(CloverInvertArg<Clover> arg, int x, int parity) {
 
-    // load the clover term into memory
-    arg.clover.load(A, x, parity);
+    double trlogA = 0.0;
 
     for (int ch=0; ch<2; ch++) {
+      Float A[36];
+      // load this chiral block of the clover term (A holds one block only, so it is indexed from 0)
+      arg.clover.load(A, x, parity, ch);
 
       Float diag[6];
       Float tmp[6]; // temporary storage
@@ -50,51 +46,36 @@ namespace quda {
       // hack into the right order as MILC just to copy algorithm directly
       // FIXME use native ordering in the Cholseky 
       // factor of two is inherent to QUDA clover storage
-      for (int i=0; i<6; i++) diag[i] = 2.0*A[ch*36+i];
+      constexpr Float two = static_cast<Float>(2.0);
+      for (int i=0; i<6; i++) diag[i] = two*A[i];
 
       const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
-      for (int i=0; i<15; i++) tri[idtab[i]] = complex<Float>(2.0*A[ch*36+6+2*i], 2.0*A[ch*36+6+2*i+1]);
+#pragma unroll
+      for (int i=0; i<15; i++) tri[idtab[i]] = complex<Float>(two*A[6+2*i], two*A[6+2*i+1]);
 
-//Compute (T^2 + mu2) first, then invert (not optimized!):
-      if(arg.twist)
-      {
+      //Compute (T^2 + mu2) first, then invert (not optimized!):
+      if (twist) {
          complex<Float> aux[15];//hmmm, better to reuse A-regs...
         //another solution just to define (but compiler may not be happy with this, swapping everything in
         //the global buffer):
         //complex<Float>* aux = (complex<Float>*)&A[ch*36];
         //compute off-diagonal terms:
-//
         aux[ 0] = tri[0]*diag[0]+diag[1]*tri[0]+conj(tri[2])*tri[1]+conj(tri[4])*tri[3]+conj(tri[7])*tri[6]+conj(tri[11])*tri[10];
-//
         aux[ 1] = tri[1]*diag[0]+diag[2]*tri[1]+tri[2]*tri[0]+conj(tri[5])*tri[3]+conj(tri[8])*tri[6]+conj(tri[12])*tri[10];
-
         aux[ 2] = tri[2]*diag[1]+diag[2]*tri[2]+tri[1]*conj(tri[0])+conj(tri[5])*tri[4]+conj(tri[8])*tri[7]+conj(tri[12])*tri[11];
-//
         aux[ 3] = tri[3]*diag[0]+diag[3]*tri[3]+tri[4]*tri[0]+tri[5]*tri[1]+conj(tri[9])*tri[6]+conj(tri[13])*tri[10];
-
         aux[ 4] = tri[4]*diag[1]+diag[3]*tri[4]+tri[3]*conj(tri[0])+tri[5]*tri[2]+conj(tri[9])*tri[7]+conj(tri[13])*tri[11];
-
         aux[ 5] = tri[5]*diag[2]+diag[3]*tri[5]+tri[3]*conj(tri[1])+tri[4]*conj(tri[2])+conj(tri[9])*tri[8]+conj(tri[13])*tri[12];
-//
         aux[ 6] = tri[6]*diag[0]+diag[4]*tri[6]+tri[7]*tri[0]+tri[8]*tri[1]+tri[9]*tri[3]+conj(tri[14])*tri[10];
-
         aux[ 7] = tri[7]*diag[1]+diag[4]*tri[7]+tri[6]*conj(tri[0])+tri[8]*tri[2]+tri[9]*tri[4]+conj(tri[14])*tri[11];
-
         aux[ 8] = tri[8]*diag[2]+diag[4]*tri[8]+tri[6]*conj(tri[1])+tri[7]*conj(tri[2])+tri[9]*tri[5]+conj(tri[14])*tri[12];
-
         aux[ 9] = tri[9]*diag[3]+diag[4]*tri[9]+tri[6]*conj(tri[3])+tri[7]*conj(tri[4])+tri[8]*conj(tri[5])+conj(tri[14])*tri[13];
-//
         aux[10] = tri[10]*diag[0]+diag[5]*tri[10]+tri[11]*tri[0]+tri[12]*tri[1]+tri[13]*tri[3]+tri[14]*tri[6];
-
         aux[11] = tri[11]*diag[1]+diag[5]*tri[11]+tri[10]*conj(tri[0])+tri[12]*tri[2]+tri[13]*tri[4]+tri[14]*tri[7];
-
         aux[12] = tri[12]*diag[2]+diag[5]*tri[12]+tri[10]*conj(tri[1])+tri[11]*conj(tri[2])+tri[13]*tri[5]+tri[14]*tri[8];
-
         aux[13] = tri[13]*diag[3]+diag[5]*tri[13]+tri[10]*conj(tri[3])+tri[11]*conj(tri[4])+tri[12]*conj(tri[5])+tri[14]*tri[9];
-
         aux[14] = tri[14]*diag[4]+diag[5]*tri[14]+tri[10]*conj(tri[6])+tri[11]*conj(tri[7])+tri[12]*conj(tri[8])+tri[13]*conj(tri[9]);
 
-
         //update diagonal elements:
         diag[0] = (Float)arg.mu2+diag[0]*diag[0]+norm(tri[ 0])+norm(tri[ 1])+norm(tri[ 3])+norm(tri[ 6])+norm(tri[10]);
         diag[1] = (Float)arg.mu2+diag[1]*diag[1]+norm(tri[ 0])+norm(tri[ 2])+norm(tri[ 4])+norm(tri[ 7])+norm(tri[11]); 
@@ -103,10 +84,10 @@ namespace quda {
         diag[4] = (Float)arg.mu2+diag[4]*diag[4]+norm(tri[ 6])+norm(tri[ 7])+norm(tri[ 8])+norm(tri[ 9])+norm(tri[14]);
         diag[5] = (Float)arg.mu2+diag[5]*diag[5]+norm(tri[10])+norm(tri[11])+norm(tri[12])+norm(tri[13])+norm(tri[14]);
 
-        //update off-diagonal elements:
+	 //update off-diagonal elements:
         for(int i = 0; i < 15; i++) tri[i] = aux[i];
       }
-//
+
       for (int j=0; j<6; j++) {
 	diag[j] = sqrt(diag[j]);
 	tmp[j] = 1.0 / diag[j];
@@ -128,7 +109,7 @@ namespace quda {
       }
       
       /* Accumulate trlogA */
-      for (int j=0;j<6;j++) trlogA += (double)2.0*log((double)(diag[j]));
+      if (computeTrLog) for (int j=0;j<6;j++) trlogA += 2.0*log((double)(diag[j]));
 
       /* Now use forward and backward substitution to construct inverse */
       complex<Float> v1[6];
@@ -165,79 +146,94 @@ namespace quda {
 	}
       }
 
-      for (int i=0; i<6; i++) A[ch*36+i] = 0.5 * diag[i];
-      for (int i=0; i<15; i++) {
-	A[ch*36+6+2*i] = 0.5*tri[idtab[i]].real(); A[ch*36+6+2*i+1] = 0.5*tri[idtab[i]].imag();
-      }
-    }	     
+      constexpr Float half = static_cast<Float>(0.5);
+      for (int i=0; i<6; i++) A[i] = half * diag[i];
+#pragma unroll
+      for (int i=0; i<15; i++) { A[6+2*i] = half*tri[idtab[i]].real(); A[6+2*i+1] = half*tri[idtab[i]].imag(); }
 
-    // save the inverted matrix
-    arg.inverse.save(A, x, parity);
+      // save the inverted chiral block
+      arg.inverse.save(A, x, parity, ch);
+    }
 
     return trlogA;
   }
 
-  template <int blockSize, typename Float, typename Clover>
+  template <int blockSize, typename Float, typename Clover, bool computeTrLog, bool twist>
   void cloverInvert(CloverInvertArg<Clover> arg) {  
     for (int parity=0; parity<2; parity++) {
       for (int x=0; x<arg.clover.volumeCB; x++) {
 	// should make this thread safe if we ever apply threads to cpu code
-	double trlogA = cloverInvertCompute<blockSize, Float>(arg, x, parity);
-	if (arg.computeTraceLog) arg.trlogA_h[parity] += trlogA;
+	double trlogA = cloverInvertCompute<blockSize,Float,Clover,computeTrLog,twist>(arg, x, parity);
+	if (computeTrLog) {
+	  if (parity) arg.result_h[0].y += trlogA;
+	  else arg.result_h[0].x += trlogA;
+	}
       }
     }
   }
 
-  template <int blockSize, typename Float, typename Clover>
+  template <int blockSize, typename Float, typename Clover, bool computeTrLog, bool twist>
+  __launch_bounds__(2*blockSize)
   __global__ void cloverInvertKernel(CloverInvertArg<Clover> arg) {  
     int idx = blockIdx.x*blockDim.x + threadIdx.x;
-    //if (idx >= arg.clover.volumeCB) return;
-    int parity = blockIdx.y;
-    double trlogA = 0.0;
-    if (idx < arg.clover.volumeCB) trlogA = cloverInvertCompute<blockSize, Float>(arg, idx, parity);
-
-    if (arg.computeTraceLog) {
-      typedef cub::BlockReduce<double, blockSize> BlockReduce;
-      __shared__ typename BlockReduce::TempStorage temp_storage;
-      double aggregate = BlockReduce(temp_storage).Sum(trlogA);
-      if (threadIdx.x == 0) atomicAdd(arg.trlogA_d+parity, aggregate);
-    }
+    int parity = threadIdx.y;
+    double trlogA_parity = 0.0;
+    if (idx < arg.clover.volumeCB)
+      trlogA_parity = cloverInvertCompute<blockSize,Float,Clover,computeTrLog,twist>(arg, idx, parity);
+    double2 trlogA = parity ? make_double2(0.0,trlogA_parity) : make_double2(trlogA_parity, 0.0);
 
+    if (computeTrLog) reduce2d<blockSize,2>(arg, trlogA);
   }
 
   template <typename Float, typename Clover>
-  class CloverInvert : Tunable {
+  class CloverInvert : TunableLocalParity {
     CloverInvertArg<Clover> arg;
     const CloverField &meta; // used for meta data only
     const QudaFieldLocation location;
 
   private:
-    unsigned int sharedBytesPerThread() const { return 0; }
-    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
-
     bool tuneSharedBytes() const { return false; } // Don't tune the shared memory
-    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
     unsigned int minThreads() const { return arg.clover.volumeCB; }
 
   public:
     CloverInvert(CloverInvertArg<Clover> &arg, const CloverField &meta, QudaFieldLocation location) 
       : arg(arg), meta(meta), location(location) { 
-      writeAuxString("stride=%d,prec=%lu",arg.clover.stride,sizeof(Float));
+      writeAuxString("stride=%d,prec=%lu,trlog=%s,twist=%s", arg.clover.stride, sizeof(Float),
+		     arg.computeTraceLog ? "true" : "false", arg.twist ? "true" : "false");
     }
     virtual ~CloverInvert() { ; }
   
     void apply(const cudaStream_t &stream) {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      arg.trlogA_h[0] = 0.0; arg.trlogA_h[1] = 0.0;
+      arg.result_h[0] = make_double2(0.,0.);
       if (location == QUDA_CUDA_FIELD_LOCATION) {
-	tp.grid.y = 2; // for parity
-	LAUNCH_KERNEL(cloverInvertKernel, tp, stream, arg, Float, Clover);
+	if (arg.computeTraceLog) {
+	  if (arg.twist) {
+	    LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, true, true);
+	  } else {
+	    LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, true, false);
+	  }
+	} else {
+	  if (arg.twist) {
+	    LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, false, true);
+	  } else {
+	    LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, false, false);
+	  }
+	}
       } else {
-	cloverInvert<1, Float, Clover>(arg);
-      }
-      if (arg.computeTraceLog) {
-	cudaDeviceSynchronize();
-	reduceDoubleArray(arg.trlogA_h, 2);
+	if (arg.computeTraceLog) {
+	  if (arg.twist) {
+	    cloverInvert<1, Float, Clover, true, true>(arg);
+	  } else {
+	    cloverInvert<1, Float, Clover, true, false>(arg);
+	  }
+	} else {
+	  if (arg.twist) {
+	    cloverInvert<1, Float, Clover, false, true>(arg);
+	  } else {
+	    cloverInvert<1, Float, Clover, false, false>(arg);
+	  }
+	}
       }
     }
 
@@ -254,15 +250,25 @@ namespace quda {
 
     long long flops() const { return 0; } 
     long long bytes() const { return 2*arg.clover.volumeCB*(arg.inverse.Bytes() + arg.clover.Bytes()); } 
+
+    void preTune() { if (arg.clover.clover == arg.inverse.clover) arg.inverse.save(); }
+    void postTune() { if (arg.clover.clover == arg.inverse.clover) arg.inverse.load(); }
+
   };
 
   template <typename Float, typename Clover>
   void cloverInvert(Clover inverse, const Clover clover, bool computeTraceLog, 
 		    double* const trlog, const CloverField &meta, QudaFieldLocation location) {
-    CloverInvertArg<Clover> arg(inverse, clover, computeTraceLog, trlog);
+    CloverInvertArg<Clover> arg(inverse, clover, computeTraceLog);
     CloverInvert<Float,Clover> invert(arg, meta, location);
     invert.apply(0);
-    cudaDeviceSynchronize();
+
+    if (arg.computeTraceLog) {
+      cudaDeviceSynchronize();
+      comm_allreduce_array((double*)arg.result_h, 2);
+      trlog[0] = arg.result_h[0].x;
+      trlog[1] = arg.result_h[0].y;
+    }
   }
 
   template <typename Float>
diff --git a/lib/clover_quda.cu b/lib/clover_quda.cu
index 17fb161dec..4814d057f1 100644
--- a/lib/clover_quda.cu
+++ b/lib/clover_quda.cu
@@ -95,7 +95,6 @@ namespace quda {
       const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
       Float diag[6];
       Complex triangle[15];
-      Float A[72];
 
       // This uses lots of unnecessary memory
       for(int ch=0; ch<2; ++ch){ 
@@ -129,18 +128,17 @@ namespace quda {
         triangle[14] =   block1[ch](2,1);
 
 
-        for(int i=0; i<6; ++i){
-          A[ch*36 + i] = 0.5*diag[i];
-        } 
+	Float A[36];
+        for(int i=0; i<6; ++i) A[i] = static_cast<Float>(0.5)*diag[i];
+	// off-diagonal entries get the same factor of 1/2, cast to Float so float builds do not promote to double
         for(int i=0; i<15; ++i){
-          A[ch*36+6+2*i]     = 0.5*triangle[idtab[i]].x;
-          A[ch*36+6+2*i + 1] = 0.5*triangle[idtab[i]].y;
+          A[6+2*i]     = static_cast<Float>(0.5)*triangle[idtab[i]].x;
+          A[6+2*i + 1] = static_cast<Float>(0.5)*triangle[idtab[i]].y;
         } 
+	arg.clover.save(A, idx, parity, ch);
       } // ch
       // 84 floating-point ops
 
-
-      arg.clover.save(A, idx, parity);
       return;
     }
 
diff --git a/lib/coarse_op.cu b/lib/coarse_op.cu
index 1dc9c3f0ed..5a0e28b24d 100644
--- a/lib/coarse_op.cu
+++ b/lib/coarse_op.cu
@@ -15,7 +15,7 @@ namespace quda {
   template <typename Float, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder, QudaCloverFieldOrder clOrder,
             int fineColor, int fineSpin, int coarseColor, int coarseSpin>
   void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T,
-		  const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
+		  const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
 
     typedef typename colorspinor::FieldOrderCB<Float,fineSpin,fineColor,coarseColor,csOrder> F;
     typedef typename gauge::FieldOrder<Float,fineColor,1,gOrder> gFine;
@@ -34,7 +34,7 @@ namespace quda {
     gCoarse xAccessor(const_cast<GaugeField&>(X));
     gCoarse xInvAccessor(const_cast<GaugeField&>(Xinv));
     cFine cAccessor(const_cast<CloverField&>(c), false);
-    cFine cInvAccessor(const_cast<CloverField&>(cI), true);
+    cFine cInvAccessor(const_cast<CloverField&>(c), true);
 
     calculateY<false,Float,fineSpin,fineColor,coarseSpin,coarseColor,gOrder>
       (yAccessor, xAccessor, xInvAccessor, uvAccessor, avAccessor, vAccessor, gAccessor, cAccessor, cInvAccessor, Y, X, Xinv, Yhat, av, v, kappa, mu, dirac, matpc);
@@ -44,28 +44,28 @@ namespace quda {
   template <typename Float, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder, QudaCloverFieldOrder clOrder,
 	    int fineColor, int fineSpin>
   void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T,
-		  const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
+		  const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
     if (T.Vectors().Nspin()/T.Spin_bs() != 2)
       errorQuda("Unsupported number of coarse spins %d\n",T.Vectors().Nspin()/T.Spin_bs());
     const int coarseSpin = 2;
     const int coarseColor = Y.Ncolor() / coarseSpin;
 
     if (coarseColor == 2) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,2,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,2,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else if (coarseColor == 4) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,4,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,4,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else if (coarseColor == 8) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,8,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,8,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else if (coarseColor == 12) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,12,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,12,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else if (coarseColor == 16) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,16,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,16,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else if (coarseColor == 20) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,20,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,20,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else if (coarseColor == 24) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,24,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,24,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else if (coarseColor == 32) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,32,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,fineSpin,32,coarseSpin>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else {
       errorQuda("Unsupported number of coarse dof %d\n", Y.Ncolor());
     }
@@ -74,9 +74,9 @@ namespace quda {
   // template on fine spin
   template <typename Float, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder, QudaCloverFieldOrder clOrder, int fineColor>
   void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T,
-		  const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
+		  const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
     if (uv.Nspin() == 4) {
-      calculateY<Float,csOrder,gOrder,clOrder,fineColor,4>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,fineColor,4>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else {
       errorQuda("Unsupported number of spins %d\n", uv.Nspin());
     }
@@ -85,9 +85,9 @@ namespace quda {
   // template on fine colors
   template <typename Float, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder, QudaCloverFieldOrder clOrder>
   void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T,
-		  const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
+		  const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
     if (g.Ncolor() == 3) {
-      calculateY<Float,csOrder,gOrder,clOrder,3>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,clOrder,3>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else {
       errorQuda("Unsupported number of colors %d\n", g.Ncolor());
     }
@@ -95,10 +95,10 @@ namespace quda {
 
   template <typename Float, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder>
   void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T,
-		  const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
+		  const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
     //If c == NULL, then this is standard Wilson.  csOrder is dummy and will not matter      
     if (c.Order() == QUDA_PACKED_CLOVER_ORDER) {
-      calculateY<Float,csOrder,gOrder,QUDA_PACKED_CLOVER_ORDER>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,gOrder,QUDA_PACKED_CLOVER_ORDER>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else {
       errorQuda("Unsupported field order %d\n", c.Order());
     }
@@ -106,9 +106,9 @@ namespace quda {
 
   template <typename Float, QudaFieldOrder csOrder>
   void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T,
-		  const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
+		  const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
     if (g.FieldOrder() == QUDA_QDP_GAUGE_ORDER) {
-      calculateY<Float,csOrder,QUDA_QDP_GAUGE_ORDER>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,csOrder,QUDA_QDP_GAUGE_ORDER>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else {
       errorQuda("Unsupported field order %d\n", g.FieldOrder());
     }
@@ -116,9 +116,9 @@ namespace quda {
 
  template <typename Float>
  void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T,
-		 const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
+		 const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
     if (T.Vectors().FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
-      calculateY<Float,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<Float,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else {
       errorQuda("Unsupported field order %d\n", T.Vectors().FieldOrder());
     }
@@ -126,7 +126,7 @@ namespace quda {
 
   //Does the heavy lifting of creating the coarse color matrices Y
   void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T,
-		  const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
+		  const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
     if (X.Precision() != Y.Precision() || Y.Precision() != uv.Precision() ||
         Y.Precision() != T.Vectors().Precision() || Y.Precision() != g.Precision())
       errorQuda("Unsupported precision mix");
@@ -135,12 +135,12 @@ namespace quda {
 
     if (Y.Precision() == QUDA_DOUBLE_PRECISION) {
 #ifdef GPU_MULTIGRID_DOUBLE
-      calculateY<double>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<double>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
 #else
       errorQuda("Double precision multigrid has not been enabled");
 #endif
     } else if (Y.Precision() == QUDA_SINGLE_PRECISION) {
-      calculateY<float>(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc);
+      calculateY<float>(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc);
     } else {
       errorQuda("Unsupported precision %d\n", Y.Precision());
     }
@@ -150,7 +150,7 @@ namespace quda {
   //Calculates the coarse color matrix and puts the result in Y.
   //N.B. Assumes Y, X have been allocated.
   void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T,
-		const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv,
+		const cudaGaugeField &gauge, const cudaCloverField *clover,
 		double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) {
     QudaPrecision precision = Y.Precision();
     //First make a cpu gauge field from the cuda gauge field
@@ -203,19 +203,9 @@ namespace quda {
     cf_param.create = QUDA_NULL_FIELD_CREATE;
     cf_param.siteSubset = QUDA_FULL_SITE_SUBSET;
 
-    if (cloverInv && (dirac == QUDA_TWISTED_CLOVERPC_DIRAC)) {
-      cf_param.direct = false;
-      cpuCloverField cI(cf_param);
-      cloverInv->saveCPUField(cI);
-      cf_param.direct = true;
-      cpuCloverField c(cf_param);
-      clover->saveCPUField(c);
-      calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, cI, kappa, mu, dirac, matpc);
-    } else {
-      cpuCloverField c(cf_param);
-      if (clover) clover->saveCPUField(c);
-      calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, c, kappa, mu, dirac, matpc);
-    }
+    cpuCloverField c(cf_param);
+    if (clover) clover->saveCPUField(c);
+    calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, kappa, mu, dirac, matpc);
 
     if (&T.Vectors() != av) delete av;
     delete uv;
diff --git a/lib/dirac_clover.cpp b/lib/dirac_clover.cpp
index 76f4a45d04..6d12fa2bc8 100644
--- a/lib/dirac_clover.cpp
+++ b/lib/dirac_clover.cpp
@@ -158,8 +158,7 @@ namespace quda {
   }
 
   void DiracClover::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const {
-    cudaCloverField *cInv = NULL;
-    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, cInv, kappa, 0.0, QUDA_CLOVER_DIRAC, QUDA_MATPC_INVALID);
+    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, 0.0, QUDA_CLOVER_DIRAC, QUDA_MATPC_INVALID);
   }
 
   DiracCloverPC::DiracCloverPC(const DiracParam &param) : 
@@ -379,8 +378,7 @@ namespace quda {
   }
 
   void DiracCloverPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const {
-    cudaCloverField *cInv = NULL;
-    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, cInv, kappa, 0.0, QUDA_CLOVERPC_DIRAC, matpcType);
+    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, 0.0, QUDA_CLOVERPC_DIRAC, matpcType);
   }
 
 } // namespace quda
diff --git a/lib/dirac_twisted_clover.cpp b/lib/dirac_twisted_clover.cpp
index f0ac9a0887..4f4bfc9477 100644
--- a/lib/dirac_twisted_clover.cpp
+++ b/lib/dirac_twisted_clover.cpp
@@ -14,14 +14,14 @@ namespace quda {
   }
 
   DiracTwistedClover::DiracTwistedClover(const DiracParam &param, const int nDim) 
-    : DiracWilson(param, nDim), mu(param.mu), epsilon(param.epsilon), clover(*(param.clover)), cloverInv(*(param.cloverInv))
+    : DiracWilson(param, nDim), mu(param.mu), epsilon(param.epsilon), clover(*(param.clover))
   {
     twistedclover::initConstants(*param.gauge,profile);
     dslash_aux::initConstants(*param.gauge,profile);
   }
 
   DiracTwistedClover::DiracTwistedClover(const DiracTwistedClover &dirac) 
-    : DiracWilson(dirac), mu(dirac.mu), epsilon(dirac.epsilon), clover(dirac.clover), cloverInv(dirac.cloverInv)
+    : DiracWilson(dirac), mu(dirac.mu), epsilon(dirac.epsilon), clover(dirac.clover)
   {
     twistedclover::initConstants(*dirac.gauge,profile);
     dslash_aux::initConstants(*dirac.gauge,profile);
@@ -35,7 +35,6 @@ namespace quda {
       {
 	DiracWilson::operator=(dirac);
 	clover = dirac.clover;
-	cloverInv = dirac.cloverInv;
       }
 
     return *this;
@@ -61,12 +60,9 @@ namespace quda {
     if (in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS)
       {
 
-	FullClover *cs = new FullClover(clover);
-#ifndef DYNAMIC_CLOVER
-	FullClover *cI = new FullClover(cloverInv, false);
-#else
-	FullClover *cI = NULL;
-#endif
+	FullClover *cs = new FullClover(clover, false);
+	FullClover *cI = new FullClover(clover, true);
+
 	double flavor_mu = in.TwistFlavor() * mu;
 	twistCloverGamma5Cuda(&static_cast<cudaColorSpinorField&>(out),
 			      &static_cast<const cudaColorSpinorField&>(in),
@@ -78,9 +74,7 @@ namespace quda {
 	  flops += 552ll*in.Volume();
 
 	delete cs;
-#ifndef DYNAMIC_CLOVER
 	delete cI;
-#endif
       }
     else
       errorQuda("DiracTwistedClover::twistedCloverApply method for flavor doublet is not implemented..\n");
@@ -113,12 +107,8 @@ namespace quda {
 
     twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda      
 
-    FullClover *cs = new FullClover(clover);
-#ifndef DYNAMIC_CLOVER
-    FullClover *cI = new FullClover(cloverInv, false);
-#else
-    FullClover *cI = NULL;
-#endif
+    FullClover *cs = new FullClover(clover, false);
+    FullClover *cI = new FullClover(clover, true);
 
     if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){
       double a = 2.0 * kappa * in.TwistFlavor() * mu;//for direct twist (must be daggered separately)  
@@ -136,9 +126,7 @@ namespace quda {
     }
     deleteTmp(&tmp, reset);
     delete cs;
-#ifndef DYNAMIC_CLOVER
     delete cI;
-#endif
   }
 
   void DiracTwistedClover::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -172,7 +160,7 @@ namespace quda {
 
   void DiracTwistedClover::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const {
     double a = 2.0 * kappa * mu * T.Vectors().TwistFlavor();
-    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, &cloverInv, kappa, a, QUDA_TWISTED_CLOVER_DIRAC, QUDA_MATPC_INVALID);
+    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, a, QUDA_TWISTED_CLOVER_DIRAC, QUDA_MATPC_INVALID);
   }
 
   DiracTwistedCloverPC::DiracTwistedCloverPC(const DiracTwistedCloverPC &dirac) : DiracTwistedClover(dirac) { }
@@ -213,12 +201,8 @@ namespace quda {
 
     twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda
   
-    FullClover *cs = new FullClover(clover);
-#ifndef DYNAMIC_CLOVER
-    FullClover *cI = new FullClover(cloverInv, false);
-#else
-    FullClover *cI = NULL;
-#endif
+    FullClover *cs = new FullClover(clover, false);
+    FullClover *cI = new FullClover(clover, true);
 
     if (in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){
       double a = -2.0 * kappa * in.TwistFlavor() * mu;  //for invert twist (not daggered)
@@ -239,9 +223,7 @@ namespace quda {
       errorQuda("Non-degenerate DiracTwistedCloverPC is not implemented \n");
     }
     delete cs;
-#ifndef DYNAMIC_CLOVER
     delete cI;
-#endif
   }
 
   // xpay version of the above
@@ -257,12 +239,8 @@ namespace quda {
 
     twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda
   
-    FullClover *cs = new FullClover(clover);
-#ifndef DYNAMIC_CLOVER
-    FullClover *cI = new FullClover(cloverInv, false);
-#else
-    FullClover *cI = NULL;
-#endif
+    FullClover *cs = new FullClover(clover, false);
+    FullClover *cI = new FullClover(clover, true);
 
     if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){
       double a = -2.0 * kappa * in.TwistFlavor() * mu;  //for invert twist
@@ -287,9 +265,7 @@ namespace quda {
       errorQuda("Non-degenerate DiracTwistedCloverPC is not implemented \n");
     }
     delete cs;
-#ifndef DYNAMIC_CLOVER
     delete cI;
-#endif
   }
 
   void DiracTwistedCloverPC::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -298,12 +274,8 @@ namespace quda {
 
     bool reset = newTmp(&tmp1, in);
 
-    FullClover *cs = new FullClover(clover);
-#ifndef DYNAMIC_CLOVER
-    FullClover *cI = new FullClover(cloverInv, false);
-#else
-    FullClover *cI = NULL;
-#endif
+    FullClover *cs = new FullClover(clover, false);
+    FullClover *cI = new FullClover(clover, true);
 
     if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){
       if (matpcType == QUDA_MATPC_EVEN_EVEN) {
@@ -352,9 +324,7 @@ namespace quda {
     }
 
     delete cs;
-#ifndef DYNAMIC_CLOVER
     delete cI;
-#endif
 
     deleteTmp(&tmp1, reset);
   }
@@ -386,27 +356,27 @@ namespace quda {
       if (matpcType == QUDA_MATPC_EVEN_EVEN) {
         // src = A_ee^-1 (b_e + k D_eo A_oo^-1 b_o)
         src = &(x.Odd());
-        TwistCloverInv(*src, b.Odd(), 1);
+        TwistCloverInv(*src, b.Odd(), QUDA_ODD_PARITY);
         DiracWilson::DslashXpay(*tmp1, *src, QUDA_EVEN_PARITY, b.Even(), kappa);
-        TwistCloverInv(*src, *tmp1, 0);
+        TwistCloverInv(*src, *tmp1, QUDA_EVEN_PARITY);
         sol = &(x.Even());
       } else if (matpcType == QUDA_MATPC_ODD_ODD) {
         // src = A_oo^-1 (b_o + k D_oe A_ee^-1 b_e)
         src = &(x.Even());
-        TwistCloverInv(*src, b.Even(), 0);
+        TwistCloverInv(*src, b.Even(), QUDA_EVEN_PARITY);
         DiracWilson::DslashXpay(*tmp1, *src, QUDA_ODD_PARITY, b.Odd(), kappa);
-        TwistCloverInv(*src, *tmp1, 1);
+        TwistCloverInv(*src, *tmp1, QUDA_ODD_PARITY);
         sol = &(x.Odd());
       } else if (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
         // src = b_e + k D_eo A_oo^-1 b_o
         src = &(x.Odd());
-        TwistCloverInv(*tmp1, b.Odd(), 1); // safe even when *tmp1 = b.odd
+        TwistCloverInv(*tmp1, b.Odd(), QUDA_ODD_PARITY); // safe even when *tmp1 = b.odd
         DiracWilson::DslashXpay(*src, *tmp1, QUDA_EVEN_PARITY, b.Even(), kappa);
         sol = &(x.Even());
       } else if (matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
         // src = b_o + k D_oe A_ee^-1 b_e
         src = &(x.Even());
-        TwistCloverInv(*tmp1, b.Even(), 0); // safe even when *tmp1 = b.even
+        TwistCloverInv(*tmp1, b.Even(), QUDA_EVEN_PARITY); // safe even when *tmp1 = b.even
         DiracWilson::DslashXpay(*src, *tmp1, QUDA_ODD_PARITY, b.Odd(), kappa);
         sol = &(x.Odd());
       } else {
@@ -436,11 +406,11 @@ namespace quda {
       if (matpcType == QUDA_MATPC_EVEN_EVEN || matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
         // x_o = A_oo^-1 (b_o + k D_oe x_e)
         DiracWilson::DslashXpay(*tmp1, x.Even(), QUDA_ODD_PARITY, b.Odd(), kappa);
-        TwistCloverInv(x.Odd(), *tmp1, 1);
+        TwistCloverInv(x.Odd(), *tmp1, QUDA_ODD_PARITY);
       } else if (matpcType == QUDA_MATPC_ODD_ODD ||   matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
         // x_e = A_ee^-1 (b_e + k D_eo x_o)
         DiracWilson::DslashXpay(*tmp1, x.Odd(), QUDA_EVEN_PARITY, b.Even(), kappa);
-        TwistCloverInv(x.Even(), *tmp1, 0);
+        TwistCloverInv(x.Even(), *tmp1, QUDA_EVEN_PARITY);
       } else {
         errorQuda("MatPCType %d not valid for DiracTwistedCloverPC", matpcType);
       }
@@ -452,6 +422,6 @@ namespace quda {
 
   void DiracTwistedCloverPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const {
     double a = -2.0 * kappa * mu * T.Vectors().TwistFlavor();
-    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, &cloverInv, kappa, a, QUDA_TWISTED_CLOVERPC_DIRAC, matpcType);
+    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, a, QUDA_TWISTED_CLOVERPC_DIRAC, matpcType);
   }
 } // namespace quda
diff --git a/lib/dirac_twisted_mass.cpp b/lib/dirac_twisted_mass.cpp
index 25088fbeb8..36c404067b 100644
--- a/lib/dirac_twisted_mass.cpp
+++ b/lib/dirac_twisted_mass.cpp
@@ -189,7 +189,7 @@ namespace quda {
   void DiracTwistedMass::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const {
     double a = 2.0 * kappa * mu * T.Vectors().TwistFlavor();
     cudaCloverField *c = NULL;
-    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, a, QUDA_TWISTED_MASS_DIRAC, QUDA_MATPC_INVALID);
+    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, a, QUDA_TWISTED_MASS_DIRAC, QUDA_MATPC_INVALID);
   }
 
   DiracTwistedMassPC::DiracTwistedMassPC(const DiracTwistedMassPC &dirac) : DiracTwistedMass(dirac) { }
@@ -548,6 +548,6 @@ namespace quda {
   void DiracTwistedMassPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const {
     double a = -2.0 * kappa * mu * T.Vectors().TwistFlavor();
     cudaCloverField *c = NULL;
-    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, a, QUDA_TWISTED_MASSPC_DIRAC, matpcType);
+    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, a, QUDA_TWISTED_MASSPC_DIRAC, matpcType);
   }
 } // namespace quda
diff --git a/lib/dirac_wilson.cpp b/lib/dirac_wilson.cpp
index 5b66da3077..cc27de9134 100644
--- a/lib/dirac_wilson.cpp
+++ b/lib/dirac_wilson.cpp
@@ -158,7 +158,7 @@ namespace quda {
 
   void DiracWilson::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const {
     cudaCloverField *c = NULL;
-    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, 0.0, QUDA_WILSON_DIRAC, QUDA_MATPC_INVALID);
+    CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, 0.0, QUDA_WILSON_DIRAC, QUDA_MATPC_INVALID);
   }
 
   DiracWilsonPC::DiracWilsonPC(const DiracParam &param)
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index a7e50db23c..64dab6ebd6 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -130,10 +130,6 @@ cudaCloverField *cloverPrecise = NULL;
 cudaCloverField *cloverSloppy = NULL;
 cudaCloverField *cloverPrecondition = NULL;
 
-cudaCloverField *cloverInvPrecise = NULL;
-cudaCloverField *cloverInvSloppy = NULL;
-cudaCloverField *cloverInvPrecondition = NULL;
-
 cudaGaugeField *momResident = NULL;
 cudaGaugeField *extendedGaugeResident = NULL;
 
@@ -699,6 +695,8 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 
 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 {
+  if (!gaugePrecise) errorQuda("Cannot call loadCloverQuda with no resident gauge field");
+
   profileClover.TPSTART(QUDA_PROFILE_TOTAL);
   bool device_calc = false; // calculate clover and inverse on the device?
 
@@ -707,23 +705,16 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 
   if (!initialized) errorQuda("QUDA not initialized");
 
-  if (!h_clover && !h_clovinv) {
-    if(inv_param->clover_coeff != 0){
-      device_calc = true;
-    }else{
-      errorQuda("loadCloverQuda() called with neither clover term nor inverse");
-    }
+  if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) {
+    device_calc = true;
+    if (inv_param->clover_coeff == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient not set");
+    if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field");
   }
 
-
-  if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
-    errorQuda("Half precision not supported on CPU");
-  }
-  if (gaugePrecise == NULL) {
-    errorQuda("Gauge field must be loaded before clover");
-  }
+  if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION)  errorQuda("Half precision not supported on CPU");
+  if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded before clover");
   if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)) {
-    errorQuda("Wrong dslash_type in loadCloverQuda()");
+    errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type);
   }
 
   // determines whether operator is preconditioned when calling invertQuda()
@@ -751,243 +742,151 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
     warningQuda("Uninverted clover term not loaded");
   }
 
-  CloverFieldParam clover_param;
-  CloverField *in=NULL;
-#ifndef DYNAMIC_CLOVER
-  CloverField *inInv=NULL;
+  bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false;
+#ifdef DYNAMIC_CLOVER
+  bool dynamic_clover = twisted ? true : false; // dynamic clover only supported on twisted clover currently
+#else
+  bool dynamic_clover = false;
 #endif
 
-  if(!device_calc){
-    // create a param for the cpu clover field
-    profileClover.TPSTART(QUDA_PROFILE_INIT);
-    CloverFieldParam cpuParam;
-    cpuParam.nDim = 4;
-    for (int i=0; i<4; i++) cpuParam.x[i] = gaugePrecise->X()[i];
-    cpuParam.precision = inv_param->clover_cpu_prec;
-    cpuParam.order = inv_param->clover_order;
-    cpuParam.direct = h_clover ? true : false;
-    cpuParam.inverse = h_clovinv ? true : false;
-    cpuParam.clover = h_clover;
-    cpuParam.norm = 0;
-    cpuParam.cloverInv = h_clovinv;
-    cpuParam.invNorm = 0;
-    cpuParam.create = QUDA_REFERENCE_FIELD_CREATE;
-    cpuParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-    cpuParam.twisted = false;
-    cpuParam.mu2 = 0.;
-
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-      cpuParam.direct = true;
-      cpuParam.inverse = false;
-      cpuParam.cloverInv = NULL;
-      cpuParam.clover = h_clover;
-      cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu;
-      in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
-        static_cast<CloverField*>(new cpuCloverField(cpuParam)) :
-        static_cast<CloverField*>(new cudaCloverField(cpuParam));
-
-#ifndef DYNAMIC_CLOVER
-      cpuParam.cloverInv = h_clovinv;
-      cpuParam.clover = NULL;
-      cpuParam.twisted = true;
-      cpuParam.direct = true;
-      cpuParam.inverse = false;
-      cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu;
-
-      inInv = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
-        static_cast<CloverField*>(new cpuCloverField(cpuParam)) :
-        static_cast<CloverField*>(new cudaCloverField(cpuParam));
-#endif
-    } else {
-      in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
-        static_cast<CloverField*>(new cpuCloverField(cpuParam)) :
-        static_cast<CloverField*>(new cudaCloverField(cpuParam));
-    }
+  CloverFieldParam clover_param;
+  clover_param.nDim = 4;
+  clover_param.twisted = twisted;
+  clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0;
+  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
+  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
+  clover_param.pad = inv_param->cl_pad;
+  clover_param.create = QUDA_NULL_FIELD_CREATE;
+  clover_param.norm = nullptr;
+  clover_param.invNorm = nullptr;
+  clover_param.setPrecision(inv_param->clover_cuda_prec);
+  clover_param.direct = h_clover || device_calc ? true : false;
+  clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover ? true : false;
 
-    clover_param.nDim = 4;
-    for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
-    clover_param.setPrecision(inv_param->clover_cuda_prec);
-    clover_param.pad = inv_param->cl_pad;
-    clover_param.direct = h_clover ? true : false;
-    clover_param.inverse = (h_clovinv || pc_solve) ? true : false;
-    clover_param.create = QUDA_NULL_FIELD_CREATE;
-    clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
-
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-      clover_param.direct = true;
-      clover_param.inverse = false;
-      clover_param.twisted = true;
-      cloverPrecise = new cudaCloverField(clover_param);
-#ifndef DYNAMIC_CLOVER
-      clover_param.direct = false;
-      clover_param.inverse = true;
-      clover_param.twisted = true;
-      cloverInvPrecise = new cudaCloverField(clover_param);
-//      clover_param.twisted = false;
-#endif
-    } else {
-      cloverPrecise = new cudaCloverField(clover_param);
-    }
+  cloverPrecise = new cudaCloverField(clover_param);
+
+  CloverField *in = nullptr;
 
+  if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
+    // create a param for the cpu clover field
+    profileClover.TPSTART(QUDA_PROFILE_INIT);
+    CloverFieldParam inParam(clover_param);
+    inParam.precision = inv_param->clover_cpu_prec;
+    inParam.order = inv_param->clover_order;
+    inParam.direct = h_clover ? true : false;
+    inParam.inverse = h_clovinv ? true : false;
+    inParam.clover = h_clover;
+    inParam.cloverInv = h_clovinv;
+    inParam.create = QUDA_REFERENCE_FIELD_CREATE;
+    in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ?
+      static_cast<CloverField*>(new cpuCloverField(inParam)) :
+      static_cast<CloverField*>(new cudaCloverField(inParam));
     profileClover.TPSTOP(QUDA_PROFILE_INIT);
+  }
 
+  if (!device_calc) {
     profileClover.TPSTART(QUDA_PROFILE_H2D);
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-      cloverPrecise->copy(*in, false);
-#ifndef DYNAMIC_CLOVER
-      cloverInvPrecise->copy(*in, true);
-      cloverInvert(*cloverInvPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION);
-#endif
-    } else {
-      cloverPrecise->copy(*in, h_clovinv ? true : false);
-    }
-
+    cloverPrecise->copy(*in, h_clovinv && !inv_param->compute_clover_inverse ? true : false);
     profileClover.TPSTOP(QUDA_PROFILE_H2D);
   } else {
     profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
-
     createCloverQuda(inv_param);
-
-#ifndef DYNAMIC_CLOVER
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-      cloverInvert(*cloverInvPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION);
-      if (inv_param->compute_clover_trlog) {
-        inv_param->trlogA[0] = cloverInvPrecise->TrLog()[0];
-        inv_param->trlogA[1] = cloverInvPrecise->TrLog()[1];
-      }
-    }
-#endif
     profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
   // inverted clover term is required when applying preconditioned operator
-  if ((!h_clovinv && pc_solve) && inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) {
+  if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) {
     profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
-    cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION);
-    profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
-    if (inv_param->compute_clover_trlog) {
-      inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
-      inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
+    if (!dynamic_clover) {
+      cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION);
+      if (inv_param->compute_clover_trlog) {
+	inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
+	inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
+      }
     }
+    profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
-#ifndef DYNAMIC_CLOVER
-  if (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)
-    inv_param->cloverGiB = cloverPrecise->GBytes();
-  else
-    inv_param->cloverGiB = cloverPrecise->GBytes() + cloverInvPrecise->GBytes();
-#else
   inv_param->cloverGiB = cloverPrecise->GBytes();
-#endif
 
-  clover_param.norm    = 0;
-  clover_param.invNorm = 0;
-  clover_param.mu2 = 0.;
-  clover_param.nDim = 4;
-  for(int dir=0; dir<4; ++dir) clover_param.x[dir] = gaugePrecise->X()[dir];
-  clover_param.pad = inv_param->cl_pad;
-  clover_param.siteSubset = QUDA_FULL_SITE_SUBSET;
-  clover_param.create = QUDA_NULL_FIELD_CREATE;
   clover_param.direct = true;
-  clover_param.inverse = true;
+  clover_param.inverse = dynamic_clover ? false : true;
 
   // create the mirror sloppy clover field
   if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
     profileClover.TPSTART(QUDA_PROFILE_INIT);
+
     clover_param.setPrecision(inv_param->clover_cuda_prec_sloppy);
+    cloverSloppy = new cudaCloverField(clover_param);
+    cloverSloppy->copy(*cloverPrecise, clover_param.inverse);
 
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-      clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu;
-      clover_param.twisted = true;
-#ifndef DYNAMIC_CLOVER
-      clover_param.direct = false;
-      clover_param.inverse = true;
-      cloverInvSloppy = new cudaCloverField(clover_param);
-      cloverInvSloppy->copy(*cloverInvPrecise, true);
-      clover_param.direct = true;
-      clover_param.inverse = false;
-      inv_param->cloverGiB += cloverInvSloppy->GBytes();
-#endif
-      cloverSloppy = new cudaCloverField(clover_param);
-      cloverSloppy->copy(*cloverPrecise);
-      inv_param->cloverGiB += cloverSloppy->GBytes();
-    } else {
-      cloverSloppy = new cudaCloverField(clover_param);
-      cloverSloppy->copy(*cloverPrecise);
-      inv_param->cloverGiB += cloverSloppy->GBytes();
-    }
+    inv_param->cloverGiB += cloverSloppy->GBytes();
     profileClover.TPSTOP(QUDA_PROFILE_INIT);
   } else {
     cloverSloppy = cloverPrecise;
-#ifndef DYNAMIC_CLOVER
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
-      cloverInvSloppy = cloverInvPrecise;
-#endif
   }
 
   // create the mirror preconditioner clover field
   if (inv_param->clover_cuda_prec_sloppy != inv_param->clover_cuda_prec_precondition &&
       inv_param->clover_cuda_prec_precondition != QUDA_INVALID_PRECISION) {
     profileClover.TPSTART(QUDA_PROFILE_INIT);
+
     clover_param.setPrecision(inv_param->clover_cuda_prec_precondition);
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-      clover_param.direct = true;
-      clover_param.inverse = false;
-      cloverPrecondition = new cudaCloverField(clover_param);
-      cloverPrecondition->copy(*cloverSloppy);
-      inv_param->cloverGiB += cloverPrecondition->GBytes();
-#ifndef DYNAMIC_CLOVER
-      clover_param.direct = false;
-      clover_param.inverse = true;
-      clover_param.twisted = true;
-      cloverInvPrecondition = new cudaCloverField(clover_param);
-      cloverInvPrecondition->copy(*cloverInvSloppy, true);
-      inv_param->cloverGiB += cloverInvPrecondition->GBytes();
-#endif
-    } else {
-      cloverPrecondition = new cudaCloverField(clover_param);
-      cloverPrecondition->copy(*cloverSloppy);
-      inv_param->cloverGiB += cloverPrecondition->GBytes();
-    }
+    cloverPrecondition = new cudaCloverField(clover_param);
+    cloverPrecondition->copy(*cloverSloppy, clover_param.inverse);
+
+    inv_param->cloverGiB += cloverPrecondition->GBytes();
     profileClover.TPSTOP(QUDA_PROFILE_INIT);
   } else {
     cloverPrecondition = cloverSloppy;
-#ifndef DYNAMIC_CLOVER
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
-      cloverInvPrecondition = cloverInvSloppy;
-#endif
   }
 
-  // need to copy back the odd inverse field into the application clover field
-  if (!h_clovinv && pc_solve && !device_calc) {
+  // if requested, copy back the clover / inverse field
+  if ( inv_param->return_clover || inv_param->return_clover_inverse ) {
+    if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set");
+
     // copy the inverted clover term into host application order on the device
     clover_param.setPrecision(inv_param->clover_cpu_prec);
-    clover_param.direct = false;
-    clover_param.inverse = true;
-    clover_param.order = inv_param->clover_order;
+    clover_param.direct = (h_clover && inv_param->return_clover);
+    clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse);
 
     // this isn't really "epilogue" but this label suffices
     profileClover.TPSTART(QUDA_PROFILE_EPILOGUE);
-    cudaCloverField hack(clover_param);
-    hack.copy(*cloverPrecise);
+    cudaCloverField *hack = nullptr; // device-side staging field in the host application's order
+    if (!dynamic_clover) {
+      clover_param.order = inv_param->clover_order;
+      hack = new cudaCloverField(clover_param);
+      hack->copy(*cloverPrecise); // FIXME this can lead to redundant copies if we're not copying back direct + inverse
+    } else {
+      cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // temporary to invert: cloverPrecise itself is not inverted when dynamic_clover is set
+      hackOfTheHack->copy(*cloverPrecise, false);
+      cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION);
+      if (inv_param->compute_clover_trlog) { // read trlog from the field that cloverInvert was applied to
+	inv_param->trlogA[0] = hackOfTheHack->TrLog()[0];
+	inv_param->trlogA[1] = hackOfTheHack->TrLog()[1];
+      }
+      clover_param.order = inv_param->clover_order;
+      hack = new cudaCloverField(clover_param);
+      hack->copy(*hackOfTheHack); // FIXME this can lead to redundant copies if we're not copying back direct + inverse
+      delete hackOfTheHack;
+    }
     profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
-    // copy the odd components into the host application's clover field
+    // copy the field into the host application's clover field
     profileClover.TPSTART(QUDA_PROFILE_D2H);
-    qudaMemcpy((char*)(in->V(false))+in->Bytes()/2, (char*)(hack.V(true))+hack.Bytes()/2,
-        in->Bytes()/2, cudaMemcpyDeviceToHost);
+    if (inv_param->return_clover) {
+      qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost);
+    }
+    if (inv_param->return_clover_inverse) {
+      qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost);
+    }
     profileClover.TPSTOP(QUDA_PROFILE_D2H);
 
+    delete hack;
     checkCudaError();
   }
 
-  if(!device_calc)
-  {
-    if (in) delete in; // delete object referencing input field
-#ifndef DYNAMIC_CLOVER
-    if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inInv) delete inInv;
-#endif
-  }
+  if (in) delete in; // delete object referencing input field
 
   popVerbosity();
 
@@ -1174,16 +1073,6 @@ void freeCloverQuda(void)
   cloverPrecondition = NULL;
   cloverSloppy = NULL;
   cloverPrecise = NULL;
-
-  if (cloverInvPrecise != NULL) {
-     if (cloverInvPrecondition != cloverInvSloppy && cloverInvPrecondition) delete cloverInvPrecondition;
-     if (cloverInvSloppy != cloverInvPrecise && cloverInvSloppy) delete cloverInvSloppy;
-     if (cloverInvPrecise) delete cloverInvPrecise;
-
-     cloverInvPrecondition = NULL;
-     cloverInvSloppy = NULL;
-     cloverInvPrecise = NULL;
-  }
 }
 
 void endQuda(void)
@@ -1358,7 +1247,6 @@ namespace quda {
     diracParam.fatGauge = gaugeFatPrecise;
     diracParam.longGauge = gaugeLongPrecise;
     diracParam.clover = cloverPrecise;
-    diracParam.cloverInv = cloverInvPrecise;
     diracParam.kappa = kappa;
     diracParam.mass = inv_param->mass;
     diracParam.m5 = inv_param->m5;
@@ -1376,7 +1264,6 @@ namespace quda {
     diracParam.fatGauge = gaugeFatSloppy;
     diracParam.longGauge = gaugeLongSloppy;
     diracParam.clover = cloverSloppy;
-    diracParam.cloverInv = cloverInvSloppy;
 
     for (int i=0; i<4; i++) {
       diracParam.commDim[i] = 1;   // comms are always on
@@ -1399,7 +1286,6 @@ namespace quda {
       diracParam.longGauge = gaugeLongPrecondition;
     }
     diracParam.clover = cloverPrecondition;
-    diracParam.cloverInv = cloverInvPrecondition;
 
     for (int i=0; i<4; i++) {
       diracParam.commDim[i] = comms ? 1 : 0;
@@ -1524,8 +1410,6 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
   if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");
-  if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
-    errorQuda("Clover field not allocated");
 
   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
@@ -1748,8 +1632,6 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
   if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");
-  if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
-    errorQuda("Clover field not allocated");
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
@@ -1821,8 +1703,6 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   if (gaugePrecise == NULL) errorQuda("Gauge field not allocated");
   if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");
-  if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
-    errorQuda("Clover field not allocated");
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
@@ -2901,7 +2781,6 @@ void incrementalEigQuda(void *_h_x, void *_h_b, QudaInvertParam *param, void *_h
   diracHalfPrecParam.longGauge = gaugeLongPrecondition;
 
   diracHalfPrecParam.clover = cloverPrecondition;
-  diracHalfPrecParam.cloverInv = cloverInvPrecondition;
 
   for (int i=0; i<4; i++) {
       diracHalfPrecParam.commDim[i] = 1; // comms are on.
@@ -3538,37 +3417,7 @@ void createCloverQuda(QudaInvertParam* invertParam)
 {
   profileCloverCreate.TPSTART(QUDA_PROFILE_TOTAL);
   profileCloverCreate.TPSTART(QUDA_PROFILE_INIT);
-  if(!cloverPrecise){
-    CloverFieldParam cloverParam;
-    cloverParam.nDim = 4;
-    for(int dir=0; dir<4; ++dir) cloverParam.x[dir] = gaugePrecise->X()[dir];
-    cloverParam.setPrecision(invertParam->clover_cuda_prec);
-    cloverParam.pad = invertParam->cl_pad;
-    cloverParam.direct = true;
-    cloverParam.inverse = true;
-    cloverParam.norm    = 0;
-    cloverParam.invNorm = 0;
-    cloverParam.twisted = false;
-    cloverParam.create = QUDA_NULL_FIELD_CREATE;
-    cloverParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-    cloverParam.setPrecision(invertParam->cuda_prec);
-    if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
-    {
-
-      cloverParam.twisted = true;
-      cloverParam.mu2 = 4.*invertParam->kappa*invertParam->kappa*invertParam->mu*invertParam->mu;
-      cloverParam.direct = true;
-      cloverParam.inverse = false;
-      cloverPrecise = new cudaCloverField(cloverParam);
-#ifndef DYNAMIC_CLOVER
-      cloverParam.inverse = true;
-      cloverParam.direct = false;
-      cloverInvPrecise = new cudaCloverField(cloverParam);	//FIXME Only with tmClover
-#endif
-    } else {
-      cloverPrecise = new cudaCloverField(cloverParam);
-    }
-  }
+  if (!cloverPrecise) errorQuda("Clover field not allocated");
 
   int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction
   int y[4];
@@ -3595,36 +3444,11 @@ void createCloverQuda(QudaInvertParam* invertParam)
 
     // copy gaugePrecise into the extended device gauge field
     copyExtendedGauge(*cudaGaugeExtended, *gaugePrecise, QUDA_CUDA_FIELD_LOCATION);
-#if 1
-    profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT);
-    profileCloverCreate.TPSTART(QUDA_PROFILE_COMMS);
-    cudaGaugeExtended->exchangeExtendedGhost(R,true);
-    profileCloverCreate.TPSTOP(QUDA_PROFILE_COMMS);
-#else
-
-    GaugeFieldParam gParam(gaugePrecise->X(), gaugePrecise->Precision(), QUDA_RECONSTRUCT_NO,
-        pad, QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_NO);
-    gParam.create = QUDA_ZERO_FIELD_CREATE;
-    gParam.order = QUDA_MILC_GAUGE_ORDER;
-    gParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-    gParam.t_boundary = gaugePrecise->TBoundary();
-    gParam.nFace = 1;
-
-    // create an extended gauge field on the host
-    for(int dir=0; dir<4; ++dir) gParam.x[dir] += 4;
-    cpuGaugeField cpuGaugeExtended(gParam);
-    cudaGaugeExtended->saveCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION);
 
     profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT);
-    // communicate data
     profileCloverCreate.TPSTART(QUDA_PROFILE_COMMS);
-    //exchange_cpu_sitelink_ex(const_cast<int*>(gaugePrecise->X()), R, (void**)cpuGaugeExtended.Gauge_p(),
-    //			   cpuGaugeExtended.Order(),cpuGaugeExtended.Precision(), 0, 4);
-    cpuGaugeExtended.exchangeExtendedGhost(R,true);
-
-    cudaGaugeExtended->loadCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION);
+    cudaGaugeExtended->exchangeExtendedGhost(R,true);
     profileCloverCreate.TPSTOP(QUDA_PROFILE_COMMS);
-#endif
   }
 
 #ifdef MULTI_GPU
@@ -3633,7 +3457,6 @@ void createCloverQuda(QudaInvertParam* invertParam)
   GaugeField *gauge = gaugePrecise;
 #endif
 
-
   profileCloverCreate.TPSTART(QUDA_PROFILE_INIT);
   // create the Fmunu field
   GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, pad, QUDA_TENSOR_GEOMETRY);
@@ -3644,16 +3467,8 @@ void createCloverQuda(QudaInvertParam* invertParam)
   profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT);
 
   profileCloverCreate.TPSTART(QUDA_PROFILE_COMPUTE);
-
   computeFmunu(Fmunu, *gauge, QUDA_CUDA_FIELD_LOCATION);
   computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION);
-
-#ifndef DYNAMIC_CLOVER
-  if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-    computeClover(*cloverInvPrecise, Fmunu, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION); // FIXME only with tmClover
-  }
-#endif
-
   profileCloverCreate.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   profileCloverCreate.TPSTOP(QUDA_PROFILE_TOTAL);
diff --git a/lib/tune.cpp b/lib/tune.cpp
index 7889039fc4..16e8906de9 100644
--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -536,7 +536,7 @@ namespace quda {
     } else if (!tuning) {
 
       TuneParam best_param;
-      cudaError_t error;
+      cudaError_t error = cudaSuccess;
       cudaEvent_t start, end;
       float elapsed_time, best_time;
       time_t now;
@@ -599,7 +599,7 @@ namespace quda {
       }
 
       if (best_time == FLT_MAX) {
-	errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name, key.aux, key.volume);
+	errorQuda("Auto-tuning failed for %s with %s at vol=%s, error %s", key.name, key.aux, key.volume, cudaGetErrorString(error));
       }
       if (verbosity >= QUDA_VERBOSE) {
 	printfQuda("Tuned %s giving %s for %s with %s\n", tunable.paramString(best_param).c_str(),
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 4c2a793310..bfd8d08ef8 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -26,11 +26,11 @@ endif()
 
 #define tests
 
-if(${QUDA_DIRAC_WILSON} OR ${QUDA_DIRAC_DOMAIN_WALL})
-  cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp)
+if(${QUDA_DIRAC_WILSON} OR ${QUDA_DIRAC_CLOVER} OR ${QUDA_DIRAC_TWISTED_MASS} OR ${QUDA_DIRAC_TWISTED_CLOVER} OR ${QUDA_DIRAC_DOMAIN_WALL})
+  cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp blas_reference.cpp)
   target_link_libraries(dslash_test ${TEST_LIBS} )
 
-  cuda_add_executable(invert_test invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp)
+  cuda_add_executable(invert_test invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp blas_reference.cpp)
   target_link_libraries(invert_test ${TEST_LIBS})
 endif()
 
@@ -38,15 +38,15 @@ cuda_add_executable(deflation_test deflation_test.cpp wilson_dslash_reference.cp
 target_link_libraries(deflation_test ${TEST_LIBS})
 
 if(${QUDA_DIRAC_STAGGERED})
-  cuda_add_executable(staggered_dslash_test staggered_dslash_test.cpp   staggered_dslash_reference.cpp)
+  cuda_add_executable(staggered_dslash_test staggered_dslash_test.cpp staggered_dslash_reference.cpp blas_reference.cpp)
   target_link_libraries(staggered_dslash_test ${TEST_LIBS})
 
-  cuda_add_executable(staggered_invert_test staggered_invert_test.cpp  staggered_dslash_reference.cpp  blas_reference.cpp)
+  cuda_add_executable(staggered_invert_test staggered_invert_test.cpp staggered_dslash_reference.cpp blas_reference.cpp)
   target_link_libraries(staggered_invert_test ${TEST_LIBS})
 endif()
 
 if(${QUDA_MULTIGRID})
-  cuda_add_executable(multigrid_invert_test multigrid_invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp)
+  cuda_add_executable(multigrid_invert_test multigrid_invert_test.cpp wilson_dslash_reference.cpp clover_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp)
   target_link_libraries(multigrid_invert_test ${TEST_LIBS})
 
   cuda_add_executable(multigrid_benchmark_test multigrid_benchmark_test.cu)
diff --git a/tests/Makefile b/tests/Makefile
index c37e85ddc8..f948fa13ba 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -56,13 +56,13 @@ TESTS = su3_test pack_test blas_test dslash_test invert_test		\
 
 all: $(TESTS)
 
-dslash_test: dslash_test.o test_util.o gtest-all.o wilson_dslash_reference.o domain_wall_dslash_reference.o misc.o $(QUDA)
+dslash_test: dslash_test.o test_util.o gtest-all.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA)
 	$(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS)
 
-invert_test: invert_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA)
+invert_test: invert_test.o test_util.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA)
 	$(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS)
 
-multigrid_invert_test: multigrid_invert_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA)
+multigrid_invert_test: multigrid_invert_test.o test_util.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA)
 	$(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS)
 
 multigrid_benchmark_test: multigrid_benchmark_test.o test_util.o misc.o $(QUDA)
@@ -71,7 +71,7 @@ multigrid_benchmark_test: multigrid_benchmark_test.o test_util.o misc.o $(QUDA)
 deflation_test: deflation_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA)
 	$(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS)
 
-staggered_dslash_test: staggered_dslash_test.o gtest-all.o test_util.o staggered_dslash_reference.o misc.o $(QUDA)
+staggered_dslash_test: staggered_dslash_test.o gtest-all.o test_util.o staggered_dslash_reference.o misc.o blas_reference.o $(QUDA)
 	$(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) 
 
 staggered_invert_test: staggered_invert_test.o test_util.o staggered_dslash_reference.o misc.o blas_reference.o $(QUDA)
diff --git a/tests/blas_reference.cpp b/tests/blas_reference.cpp
index 1651734150..056926d1ef 100644
--- a/tests/blas_reference.cpp
+++ b/tests/blas_reference.cpp
@@ -50,6 +50,18 @@ double norm_2(void *v, int len, QudaPrecision precision) {
   else return norm2((float*)v, len);
 }
 
+// Reference xpay: overwrites y in place with x + a*y, one element at a time
+template <typename Float>
+static inline void xpay(Float *x, Float a, Float *y, int len) {
+  for (int k = 0; k < len; ++k) { y[k] = x[k] + a*y[k]; }
+}
+
+void xpay(void *x, double a, void *y, int length, QudaPrecision precision) {
+  if (precision != QUDA_DOUBLE_PRECISION) xpay((float*)x, (float)a, (float*)y, length); // every non-double precision is handled as float
+  else xpay((double*)x, a, (double*)y, length);
+}
+
+
 
 /*
 
diff --git a/tests/blas_reference.h b/tests/blas_reference.h
index cf7aa0e00e..d36a1b5f95 100644
--- a/tests/blas_reference.h
+++ b/tests/blas_reference.h
@@ -12,6 +12,8 @@ extern "C" {
   void mxpy(void *x, void *y, int len, QudaPrecision precision);
   void ax(double a, void *x, int len, QudaPrecision precision);
   void axpy(double a, void *x, void *y, int len, QudaPrecision precision);
+  void xpay(void *x, double a, void *y, int len, QudaPrecision precision);
+
   /*  void zero(float* a, int cnt);
       void copy(float* a, float *b, int len);*/
   
diff --git a/tests/clover_reference.cpp b/tests/clover_reference.cpp
new file mode 100644
index 0000000000..c108185ad1
--- /dev/null
+++ b/tests/clover_reference.cpp
@@ -0,0 +1,341 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <complex>
+
+#include <util_quda.h>
+#include <test_util.h>
+#include <wilson_dslash_reference.h>
+#include <blas_reference.h>
+
+
+/**
+   @brief Apply the clover matrix field to a single-parity spinor, one chiral block per site at a time
+   @param[out] out Result field (single parity); zeroed and accumulated while in is still read, so it presumably must not alias in
+   @param[in] clover Clover-matrix field (full field); each chiral block holds N real diagonal entries followed by N*(N-1)/2 complex off-diagonal entries
+   @param[in] in Input field (single parity)
+   @param[in] parity Parity to which we are applying the clover field (selects which Vh-site half of clover is read)
+ */
+template <typename sFloat, typename cFloat>
+void cloverReference(sFloat *out, cFloat *clover, sFloat *in, int parity) {
+  int nSpin = 4;
+  int nColor = 3;
+  int N = nColor * nSpin / 2; // dimension of one chiral block
+  int chiralBlock = N + 2*(N-1)*N/2; // reals per chiral block: N diagonal + N*(N-1)/2 complex off-diagonal
+
+  for (int i=0; i<Vh; i++) {
+    std::complex<sFloat> *In = reinterpret_cast<std::complex<sFloat>*>(&in[i*nSpin*nColor*2]);
+    std::complex<sFloat> *Out = reinterpret_cast<std::complex<sFloat>*>(&out[i*nSpin*nColor*2]);
+    // the spinor at site i is viewed as nSpin*nColor complex numbers
+    for (int chi=0; chi<nSpin/2; chi++) {
+      cFloat *D = &clover[((parity*Vh + i)*2 + chi)*chiralBlock]; // real diagonal of this block
+      std::complex<cFloat> *L = reinterpret_cast<std::complex<cFloat>*>(&D[N]); // packed off-diagonal entries follow the diagonal
+      // note: (s_col,c_col) selects the output component, (s_row,c_row) the input component
+      for (int s_col=0; s_col<nSpin/2; s_col++) { // 2 spins per chiral block
+	for (int c_col=0; c_col<nColor; c_col++) {
+	  const int col = s_col * nColor + c_col;
+	  const int Col = chi*N + col;
+	  Out[Col] = 0.0;
+
+	  for (int s_row=0; s_row<nSpin/2; s_row++) { // 2 spins per chiral block
+	    for (int c_row=0; c_row<nColor; c_row++) {
+	      const int row = s_row * nColor + c_row;
+	      const int Row = chi*N + row;
+	      // k below is the packed index of the off-diagonal pair, pairs being ordered by their smaller index
+	      if (row == col) {
+		Out[Col] += D[row] * In[Row];
+	      } else if (col < row) {
+		int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
+		Out[Col] += conj(L[k]) * In[Row]; // stored entry is used conjugated on this side of the diagonal
+	      } else if (row < col) {
+		int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;		
+		Out[Col] += L[k] * In[Row]; // stored entry is used as is on the other side
+	      }
+	    }
+	  }
+
+	}
+      }
+      
+    }
+
+  }
+
+}
+
+// Dispatch on precision: reinterpret the opaque buffers and run the typed reference kernel
+void apply_clover(void *out, void *clover, void *in, int parity, QudaPrecision precision) {
+  if (precision == QUDA_DOUBLE_PRECISION) {
+    double *dst = static_cast<double*>(out), *src = static_cast<double*>(in);
+    cloverReference(dst, static_cast<double*>(clover), src, parity);
+  } else if (precision == QUDA_SINGLE_PRECISION) {
+    float *dst = static_cast<float*>(out), *src = static_cast<float*>(in);
+    cloverReference(dst, static_cast<float*>(clover), src, parity);
+  } else {
+    // no typed reference kernel is instantiated for any other precision
+    errorQuda("Unsupported precision %d", precision);
+  }
+
+}
+
+// Reference clover dslash on one parity: out = clover applied to wil_dslash(in)
+void clover_dslash(void *out, void **gauge, void *clover, void *in, int parity,
+		   int dagger, QudaPrecision precision, QudaGaugeParam &param) {
+  void *hopped = malloc(Vh*spinorSiteSize*precision); // single-parity scratch for the hopping term
+  wil_dslash(hopped, gauge, in, parity, dagger, precision, param);
+  apply_clover(out, clover, hopped, parity, precision);
+
+  free(hopped);
+}
+
+// Apply the even-odd preconditioned Wilson-clover operator to the single-parity field `in`
+void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, 
+		  QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) {
+  // symmetric types interleave clover_inv with both hops; asymmetric types use clover_inv once between the hops and clover on `in`
+  double kappa2 = -kappa*kappa;
+  void *tmp = malloc(Vh*spinorSiteSize*precision); // single-parity scratch spinor
+  // every supported case finishes with xpay(x, kappa2, out), i.e. out = x - kappa^2 * out
+  switch(matpc_type) {
+  case QUDA_MATPC_EVEN_EVEN: // out = in - kappa^2 * (two hops, each followed by clover_inv; each preceded by it when daggered)
+    if (!dagger) {
+      wil_dslash(tmp, gauge, in, 1, dagger, precision, gauge_param);
+      apply_clover(out, clover_inv, tmp, 1, precision);
+      wil_dslash(tmp, gauge, out, 0, dagger, precision, gauge_param);
+      apply_clover(out, clover_inv, tmp, 0, precision);
+    } else {
+      apply_clover(tmp, clover_inv, in, 0, precision);
+      wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param);
+      apply_clover(tmp, clover_inv, out, 1, precision);
+      wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param);
+    }
+    xpay(in, kappa2, out, Vh*spinorSiteSize, precision);
+    break;
+  case QUDA_MATPC_EVEN_EVEN_ASYMMETRIC: // out = clover*in - kappa^2 * hop(clover_inv * hop(in))
+    wil_dslash(out, gauge, in, 1, dagger, precision, gauge_param);
+    apply_clover(tmp, clover_inv, out, 1, precision);
+    wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param);
+    apply_clover(tmp, clover, in, 0, precision);
+    xpay(tmp, kappa2, out, Vh*spinorSiteSize, precision);
+    break;
+  case QUDA_MATPC_ODD_ODD: // same as EVEN_EVEN with the two parities exchanged
+    if (!dagger) {
+      wil_dslash(tmp, gauge, in, 0, dagger, precision, gauge_param);
+      apply_clover(out, clover_inv, tmp, 0, precision);
+      wil_dslash(tmp, gauge, out, 1, dagger, precision, gauge_param);
+      apply_clover(out, clover_inv, tmp, 1, precision);
+    } else {
+      apply_clover(tmp, clover_inv, in, 1, precision);
+      wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param);
+      apply_clover(tmp, clover_inv, out, 0, precision);
+      wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param);
+    }
+    xpay(in, kappa2, out, Vh*spinorSiteSize, precision);
+    break;
+  case QUDA_MATPC_ODD_ODD_ASYMMETRIC: // same as EVEN_EVEN_ASYMMETRIC with the two parities exchanged
+    wil_dslash(out, gauge, in, 0, dagger, precision, gauge_param);
+    apply_clover(tmp, clover_inv, out, 0, precision);
+    wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param);
+    apply_clover(tmp, clover, in, 1, precision);
+    xpay(tmp, kappa2, out, Vh*spinorSiteSize, precision);
+    break;
+  default:
+    errorQuda("Unsupported matpc=%d", matpc_type);
+  }
+
+  free(tmp);
+}
+
+// Full-lattice Wilson-clover matrix built from the reference kernels:
+//   out = (clover applied to in) - kappa * (wil_dslash applied to in)
+void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa,
+		int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) {
+
+  // full-volume scratch that receives the clover term
+  void *diag = malloc(V*spinorSiteSize*precision);
+
+  // view each full field as two halves: slot 0 starts at the base pointer (even),
+  // slot 1 starts Vh sites further in (odd)
+  void *src[2] = { in, (char*)in + Vh*spinorSiteSize*precision };
+  void *dst[2] = { out, (char*)out + Vh*spinorSiteSize*precision };
+  void *loc[2] = { diag, (char*)diag + Vh*spinorSiteSize*precision };
+
+  // odd half first, then even: the hopping term reads the opposite half of `in`,
+  // the clover term reads the same half
+  for (int p = 1; p >= 0; p--) {
+    wil_dslash(dst[p], gauge, src[1-p], p, dagger, precision, gauge_param);
+    apply_clover(loc[p], clover, src[p], p, precision);
+  }
+
+  // out = diag - kappa*out across the whole volume
+  xpay(diag, -kappa, out, V*spinorSiteSize, precision);
+
+  free(diag);
+}
+
+// Typed worker for applyTwist: out = tmpH + i*a5*in for every site, spin and colour component
+template <typename Float>
+static void applyTwistSites(Float *out, const Float *in, const Float *tmpH, double a) {
+  for (int site = 0; site < Vh; site++)
+    for (int spin = 0; spin < 4; spin++) {
+      const Float a5 = ((spin / 2) ? -1.0 : +1.0) * a; // +a for spins 0,1 and -a for spins 2,3
+      for (int col = 0; col < 3; col++) {
+        const int re = site * 24 + spin * 6 + col * 2, im = re + 1; // real / imaginary slots of this component
+        out[re] = tmpH[re] - a5*in[im];
+        out[im] = tmpH[im] + a5*in[re];
+      }
+    }
+}
+// Adds i*a*in (sign flipped on the lower two spins) to tmpH and stores the result in out; dispatches on precision
+void applyTwist(void *out, void *in, void *tmpH, double a, QudaPrecision precision) {
+  switch (precision) {
+  case QUDA_DOUBLE_PRECISION:
+    applyTwistSites(static_cast<double*>(out), static_cast<const double*>(in), static_cast<const double*>(tmpH), a);
+    break;
+  case QUDA_SINGLE_PRECISION:
+    applyTwistSites(static_cast<float*>(out), static_cast<const float*>(in), static_cast<const float*>(tmpH), a);
+    break;
+  default:
+    errorQuda("Unsupported precision %d", precision);
+  }
+}
+
+// DIRECT: out = C*in + i*a*gamma_5*in.  INVERSE: out = cInv*(C*in - i*a*gamma_5*in), with a = 2*kappa*mu*flavor (negated if dagger)
+void twistCloverGamma5(void *out, void *in, void *clover, void *cInv, const int dagger, const double kappa, const double mu,
+		       const QudaTwistFlavorType flavor, const int parity, QudaTwistGamma5Type twist, QudaPrecision precision) {
+  void *tmp1 = malloc(Vh*spinorSiteSize*precision);
+  void *tmp2 = malloc(Vh*spinorSiteSize*precision);
+  // cInv is presumably 1/(C^2 + a^2), which would make INVERSE the inverse of DIRECT -- TODO confirm with the caller
+  double a = 0.0;
+
+  if (twist == QUDA_TWIST_GAMMA5_DIRECT) {
+    a = 2.0 * kappa * mu * flavor;
+
+    if (dagger) a *= -1.0;
+    // tmp1 = C*in, then out = tmp1 + i*a*gamma_5*in (cInv is not touched on this path)
+    apply_clover(tmp1, clover, in, parity, precision);
+    applyTwist(out, in, tmp1, a, precision);
+  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
+    a = -2.0 * kappa * mu * flavor;
+
+    if (dagger) a *= -1.0;
+    // same twist with the opposite sign of a, followed by a multiplication with cInv
+    apply_clover(tmp1, clover, in, parity, precision);
+    applyTwist(tmp2, in, tmp1, a, precision);
+    apply_clover(out, cInv, tmp2, parity, precision);
+  } else {
+    // unknown twist type: report through errorQuda so the failure is not turned into a successful exit status
+    errorQuda("Twist type %d not defined", twist);
+  }
+
+  free(tmp2);
+  free(tmp1);
+}
+
+void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor,
+		int parity, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &param) { // reference twisted-clover dslash for one output parity
+  void *tmp1 = malloc(Vh*spinorSiteSize*precision); // single-parity scratch spinors
+  void *tmp2 = malloc(Vh*spinorSiteSize*precision); // only written on the daggered asymmetric path
+  // non-dagger: hop first, then the inverse twist on `parity`; dagger: inverse twist on 1-parity first, then the hop
+  if (dagger) {
+    twistCloverGamma5(tmp1, in, clover, cInv, dagger, kappa, mu, flavor, 1-parity, QUDA_TWIST_GAMMA5_INVERSE, precision); // input lives on the opposite parity
+    if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
+      wil_dslash(tmp2, gauge, tmp1, parity, dagger, precision, param);
+      twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision); // asymmetric types add a second inverse twist on the output parity
+    } else {
+      wil_dslash(out, gauge, tmp1, parity, dagger, precision, param);
+    } 
+  } else {
+    wil_dslash(tmp1, gauge, in, parity, dagger, precision, param);
+    twistCloverGamma5(out, tmp1, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision);
+  }
+
+  free(tmp2);
+  free(tmp1);
+}
+
+// Full-lattice twisted-clover matrix built from the reference kernels:
+//   out = (direct twisted clover term of in) - kappa * (wil_dslash applied to in)
+void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu,
+	     QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) {
+
+  // full-volume scratch that receives the twisted clover term
+  void *diag = malloc(V*spinorSiteSize*precision);
+
+  // view each full field as two halves: slot 0 starts at the base pointer (even),
+  // slot 1 starts Vh sites further in (odd)
+  void *src[2] = { in, (char*)in + Vh*spinorSiteSize*precision };
+  void *dst[2] = { out, (char*)out + Vh*spinorSiteSize*precision };
+  void *loc[2] = { diag, (char*)diag + Vh*spinorSiteSize*precision };
+
+  // odd half first, then even: the hopping term reads the opposite half of `in`,
+  // the direct twist reads the same half and is given no inverse field (NULL)
+  for (int p = 1; p >= 0; p--) {
+    wil_dslash(dst[p], gauge, src[1-p], p, dagger, precision, gauge_param);
+    twistCloverGamma5(loc[p], src[p], clover, NULL, dagger, kappa, mu, flavor, p, QUDA_TWIST_GAMMA5_DIRECT, precision);
+  }
+
+  // out = diag - kappa*out across the whole volume
+  xpay(diag, -kappa, out, V*spinorSiteSize, precision);
+
+  free(diag);
+}
+
+// Apply the even-odd preconditioned twisted-clover Dirac operator to the single-parity field `in`
+void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor,
+              QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) {
+  // symmetric types: out = in - kappa^2 * (two hops, each followed by an inverse twist; each preceded by one when daggered)
+  double kappa2 = -kappa*kappa;
+  // asymmetric types: out = (direct twist of in) - kappa^2 * hop(inverse twist(hop(in)))
+  void *tmp1 = malloc(Vh*spinorSiteSize*precision);
+  void *tmp2 = malloc(Vh*spinorSiteSize*precision);
+  // tmp1/tmp2 are single-parity scratch spinors; `out` doubles as an intermediate buffer in the symmetric cases
+  switch(matpc_type) {
+  case QUDA_MATPC_EVEN_EVEN:
+    if (!dagger) {
+      wil_dslash(out, gauge, in, 1, dagger, precision, gauge_param);
+      twistCloverGamma5(tmp1, out, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision);
+      wil_dslash(tmp2, gauge, tmp1, 0, dagger, precision, gauge_param);
+      twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision);
+    } else {
+      twistCloverGamma5(out, in, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision);
+      wil_dslash(tmp1, gauge, out, 1, dagger, precision, gauge_param);
+      twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision);
+      wil_dslash(out, gauge, tmp2, 0, dagger, precision, gauge_param);
+    }
+    xpay(in, kappa2, out, Vh*spinorSiteSize, precision); // out = in - kappa^2 * out
+    break;
+  case QUDA_MATPC_EVEN_EVEN_ASYMMETRIC:
+    wil_dslash(tmp1, gauge, in, 1, dagger, precision, gauge_param);
+    twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision);
+    wil_dslash(out, gauge, tmp2, 0, dagger, precision, gauge_param);
+    twistCloverGamma5(tmp2, in, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_DIRECT, precision); // tmp2 is free again here and now holds the direct term
+    xpay(tmp2, kappa2, out, Vh*spinorSiteSize, precision); // out = tmp2 - kappa^2 * out
+    break;
+  case QUDA_MATPC_ODD_ODD: // same as EVEN_EVEN with the two parities exchanged
+    if (!dagger) {
+      wil_dslash(out, gauge, in, 0, dagger, precision, gauge_param);
+      twistCloverGamma5(tmp1, out, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision);
+      wil_dslash(tmp2, gauge, tmp1, 1, dagger, precision, gauge_param);
+      twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision);
+    } else {
+      twistCloverGamma5(out, in, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision);
+      wil_dslash(tmp1, gauge, out, 0, dagger, precision, gauge_param);
+      twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision);
+      wil_dslash(out, gauge, tmp2, 1, dagger, precision, gauge_param);
+    }
+    xpay(in, kappa2, out, Vh*spinorSiteSize, precision); // out = in - kappa^2 * out
+    break;
+  case QUDA_MATPC_ODD_ODD_ASYMMETRIC: // same as EVEN_EVEN_ASYMMETRIC with the two parities exchanged
+    wil_dslash(tmp1, gauge, in, 0, dagger, precision, gauge_param);
+    twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision);
+    wil_dslash(out, gauge, tmp2, 1, dagger, precision, gauge_param);
+    twistCloverGamma5(tmp1, in, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_DIRECT, precision); // tmp1 is free again here and now holds the direct term
+    xpay(tmp1, kappa2, out, Vh*spinorSiteSize, precision); // out = tmp1 - kappa^2 * out
+    break;
+  default:
+    errorQuda("Unsupported matpc=%d", matpc_type);
+  }
+
+  free(tmp2);
+  free(tmp1);
+}
diff --git a/tests/domain_wall_dslash_reference.cpp b/tests/domain_wall_dslash_reference.cpp
index 2b5ce2dd90..a2e53ff9a8 100644
--- a/tests/domain_wall_dslash_reference.cpp
+++ b/tests/domain_wall_dslash_reference.cpp
@@ -16,12 +16,6 @@
 
 using namespace quda;
 
-void xpay(void *x, double a, void *y, int length, QudaPrecision precision) {
-  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)x, a, (double*)y, length);
-  else xpay((float*)x, (float)a, (float*)y, length);
-}
-
-
 // i represents a "half index" into an even or odd "half lattice".
 // when oddBit={0,1} the half lattice is {even,odd}.
 // 
@@ -839,17 +833,12 @@ void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit,
   if (precision == QUDA_DOUBLE_PRECISION) {
     if (zero_initialize) dslashReference_5th<QUDA_4D_PC,true>((double*)out, (double*)in, oddBit, daggerBit, mferm);
     else dslashReference_5th<QUDA_4D_PC,false>((double*)out, (double*)in, oddBit, daggerBit, mferm);
-    for(int xs = 0 ; xs < Ls ; xs++)
-    {
-      xpay((double*)in  + Vh*spinorSiteSize*xs, kappa[xs], (double*)out  + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-    }
   } else {
     if (zero_initialize) dslashReference_5th<QUDA_4D_PC,true>((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
     else dslashReference_5th<QUDA_4D_PC,false>((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
-    for(int xs = 0 ; xs < Ls ; xs++)
-    {
-      xpay((float*)in  + Vh*spinorSiteSize*xs, (float)(kappa[xs]), (float*)out  + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-    }
+  }
+  for(int xs = 0 ; xs < Ls ; xs++) {
+    xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa[xs], (char*)out + precision*Vh*spinorSiteSize*xs, Vh*spinorSiteSize, precision);
   }
 }
 
@@ -921,10 +910,8 @@ void mdw_mat(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c
   mdw_dslash_5(tmp, gauge, inOdd, 1, dagger, precision, gauge_param, mferm, kappa5, true);
 
   for(int xs = 0 ; xs < Ls ; xs++) {
-    if (precision == QUDA_DOUBLE_PRECISION)
-      xpay((double*)tmp + Vh*spinorSiteSize*xs, -kappa_b[xs], (double*)outOdd + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-    else
-      xpay((float*)tmp + Vh*spinorSiteSize*xs, -(float)kappa_b[xs], (float*)outOdd + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
+    xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, -kappa_b[xs], (char*)outOdd + precision*Vh*spinorSiteSize*xs,
+	 Vh*spinorSiteSize, precision);
   }
 
   mdw_dslash_4_pre(tmp, gauge, inOdd, 1, dagger, precision, gauge_param, mferm, b5, c5, true);
@@ -932,10 +919,8 @@ void mdw_mat(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c
   mdw_dslash_5(tmp, gauge, inEven, 0, dagger, precision, gauge_param, mferm, kappa5, true);
 
   for(int xs = 0 ; xs < Ls ; xs++) {
-    if (precision == QUDA_DOUBLE_PRECISION)
-      xpay((double*)tmp + Vh*spinorSiteSize*xs, -kappa_b[xs], (double*)outEven + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-    else
-      xpay((float*)tmp + Vh*spinorSiteSize*xs, -(float)kappa_b[xs], (float*)outEven + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
+    xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, -kappa_b[xs], (char*)outEven + precision*Vh*spinorSiteSize*xs,
+	 Vh*spinorSiteSize, precision);
   }
 
   free(kappa5);
@@ -1040,10 +1025,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa
     dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm);
     dslash_5_inv(out, gauge, tmp, parity[0], dagger, precision, gauge_param, mferm, kappa_mdwf);
     for(int xs = 0 ; xs < Ls ; xs++) {
-      if (precision == QUDA_DOUBLE_PRECISION)
-	xpay((double*)in + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-      else
-	xpay((float*)in + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
+      xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs,
+	   Vh*spinorSiteSize, precision);
     }
   } else if (symmetric && dagger) {
     dslash_5_inv(tmp, gauge, in, parity[1], dagger, precision, gauge_param, mferm, kappa_mdwf);
@@ -1053,10 +1036,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa
     dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm);
     mdw_dslash_4_pre(out, gauge, tmp, parity[1], dagger, precision, gauge_param, mferm, b5, c5, true);
     for(int xs = 0 ; xs < Ls ; xs++) {
-      if (precision == QUDA_DOUBLE_PRECISION)
-	xpay((double*)in + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-      else
-	xpay((float*)in + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
+      xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs,
+	   Vh*spinorSiteSize, precision);
     }
   } else if (!symmetric && !dagger) {
     mdw_dslash_4_pre(out, gauge, in, parity[1], dagger, precision, gauge_param, mferm, b5, c5, true);
@@ -1066,8 +1047,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa
     dslash_4_4d(out, gauge, tmp, parity[1], dagger, precision, gauge_param, mferm);
     mdw_dslash_5(tmp, gauge, in, parity[0], dagger, precision, gauge_param, mferm, kappa5, true);
     for(int xs = 0 ; xs < Ls ; xs++) {
-      if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp  + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-      else xpay((float*)tmp  + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
+      xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs,
+	   Vh*spinorSiteSize, precision);
     }
   } else if (!symmetric && dagger) {
     dslash_4_4d(out, gauge, in, parity[0], dagger, precision, gauge_param, mferm);
@@ -1076,11 +1057,10 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa
     dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm);
     mdw_dslash_4_pre(out, gauge, tmp, parity[0], dagger, precision, gauge_param, mferm, b5, c5, true);
     mdw_dslash_5(tmp, gauge, in, parity[0], dagger, precision, gauge_param, mferm, kappa5, true);
-    for(int xs = 0 ; xs < Ls ; xs++)
-      {
-        if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp  + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-        else xpay((float*)tmp  + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
-      }
+    for(int xs = 0 ; xs < Ls ; xs++) {
+      xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs,
+	   Vh*spinorSiteSize, precision);
+    }
   } else {
     errorQuda("Unsupported matpc_type=%d dagger=%d", matpc_type, dagger);
   }
diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp
index a84957f955..63638913e9 100644
--- a/tests/dslash_test.cpp
+++ b/tests/dslash_test.cpp
@@ -69,12 +69,17 @@ extern QudaPrecision prec;
 extern QudaDagType dagger;
 QudaDagType not_dagger;
 
+extern bool compute_clover;
+extern double clover_coeff;
+
 extern bool verify_results;
 extern int niter;
 extern char latfile[];
 
 extern bool kernel_pack_t;
 
+QudaVerbosity verbosity = QUDA_VERBOSE;
+
 void init(int argc, char **argv) {
 
   cuda_prec = prec;
@@ -142,6 +147,7 @@ void init(int argc, char **argv) {
 
   inv_param.Ls = (inv_param.twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) ? Ls : 2;
   
+  inv_param.solve_type = (test_type == 2 || test_type == 4) ? QUDA_DIRECT_SOLVE : QUDA_DIRECT_PC_SOLVE;
   inv_param.matpc_type = matpc_type;
   inv_param.dagger = dagger;
   not_dagger = (QudaDagType)((dagger + 1)%2);
@@ -234,20 +240,11 @@ void init(int argc, char **argv) {
     inv_param.clover_cuda_prec = cuda_prec;
     inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
     inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
-    inv_param.clover_coeff = 1.5*inv_param.kappa;
-    //if (test_type > 0) {
-      hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
-      hostCloverInv = hostClover; // fake it
-      /*} else {
-      hostClover = NULL;
-      hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
-      }*/
-  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
-
+    inv_param.clover_coeff = clover_coeff;
+    hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
+    hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
   }
 
-  setVerbosity(QUDA_VERBOSE);
-
   // construct input fields
   for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);
 
@@ -255,9 +252,6 @@ void init(int argc, char **argv) {
   
   csParam.nColor = 3;
   csParam.nSpin = 4;
-  if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-    csParam.twistFlavor = inv_param.twist_flavor;
-  }
   csParam.nDim = 4;
   for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
   if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
@@ -274,7 +268,7 @@ void init(int argc, char **argv) {
   }
 
 //ndeg_tm    
-  if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
+  if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
     csParam.twistFlavor = inv_param.twist_flavor;
     csParam.nDim = (inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) ? 4 : 5;
     csParam.x[4] = inv_param.Ls;    
@@ -284,15 +278,11 @@ void init(int argc, char **argv) {
   csParam.precision = inv_param.cpu_prec;
   csParam.pad = 0;
 
-  if(dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || 
-      dslash_type == QUDA_MOBIUS_DWF_DSLASH)
-  {
+  if(dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
     csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
     csParam.x[0] /= 2;
-
-  } else
-  {
-    if (test_type < 2 || test_type ==3) {
+  } else {
+    if (test_type < 2 || test_type == 3) {
       csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
       csParam.x[0] /= 2;
     } else {
@@ -321,33 +311,40 @@ void init(int argc, char **argv) {
     construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
   }
 
-  spinor->Source(QUDA_RANDOM_SOURCE);
+  spinor->Source(QUDA_RANDOM_SOURCE, 0);
 
-  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
-    double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    double norm = 0.1; // clover components are random numbers in the range (-norm, norm)
     double diag = 1.0; // constant added to the diagonal
-
-    if (test_type == 2 || test_type == 4) {
-      construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
-    } else {
-      construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
-    }
+    construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
+    memcpy(hostCloverInv, hostClover, V*cloverSiteSize*inv_param.clover_cpu_prec);
   }
+
   printfQuda("done.\n"); fflush(stdout);
   
   initQuda(device);
 
+  if (tune) {
+    setTuning(QUDA_TUNE_YES);
+    printfQuda("Tuning...\n");
+  }
+
+  // set verbosity prior to loadGaugeQuda
+  setVerbosity(verbosity);
+  inv_param.verbosity = verbosity;
+
   printfQuda("Sending gauge field to GPU\n");
   loadGaugeQuda(hostGauge, &gauge_param);
 
-  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
-    printfQuda("Sending clover field to GPU\n");
-    loadCloverQuda(hostClover, hostCloverInv, &inv_param);
-  }
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    if (compute_clover) printfQuda("Computing clover field on GPU\n");
+    else printfQuda("Sending clover field to GPU\n");
+    inv_param.compute_clover = compute_clover;
+    inv_param.return_clover = compute_clover;
+    inv_param.compute_clover_inverse = compute_clover;
+    inv_param.return_clover_inverse = compute_clover;
 
-  if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-    printfQuda("Sending clover field to GPU\n");
-    loadCloverQuda(NULL, NULL, &inv_param);
+    loadCloverQuda(hostClover, hostCloverInv, &inv_param);
   }
 
   if (!transfer) {
@@ -444,8 +441,8 @@ void end() {
   delete spinorTmp;
 
   for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
-  if((dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)){
-    if (hostClover != hostCloverInv && hostClover) free(hostClover);
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    free(hostClover);
     free(hostCloverInv);
   }
   endQuda();
@@ -548,12 +545,16 @@ double dslashCUDA(int niter) {
     } else {
       switch (test_type) {
         case 0:
-          if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH && (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD)) {
+          if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
             if (transfer) {
               dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
             } else {
-	      ((DiracTwistedCloverPC *) dirac)->TwistCloverInv(*tmp1, *cudaSpinor, (parity+1)%2);
-              dirac->Dslash(*cudaSpinorOut, *tmp1, parity);
+	      if (dagger) {
+	        ((DiracTwistedCloverPC *) dirac)->TwistCloverInv(*tmp1, *cudaSpinor, (parity+1)%2);
+                dirac->Dslash(*cudaSpinorOut, *tmp1, parity);
+	      } else {
+                dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
+	      }
             }
           } else {
             if (transfer) {
@@ -618,13 +619,12 @@ void dslashRef() {
   printfQuda("Calculating reference implementation...");
   fflush(stdout);
 
-  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH ||
-      dslash_type == QUDA_WILSON_DSLASH) {
+  if (dslash_type == QUDA_WILSON_DSLASH) {
     switch (test_type) {
     case 0:
       wil_dslash(spinorRef->V(), hostGauge, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param);
       break;
-    case 1:    
+    case 1:
       wil_matpc(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.matpc_type, dagger, 
 		inv_param.cpu_prec, gauge_param);
       break;
@@ -645,7 +645,34 @@ void dslashRef() {
       printfQuda("Test type not defined\n");
       exit(-1);
     }
-  } else if((dslash_type == QUDA_TWISTED_MASS_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)){ 
+  } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+    switch (test_type) {
+    case 0:
+      clover_dslash(spinorRef->V(), hostGauge, hostCloverInv, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param);
+      break;
+    case 1:
+      clover_matpc(spinorRef->V(), hostGauge, hostClover, hostCloverInv, spinor->V(), inv_param.kappa, inv_param.matpc_type,
+		   dagger, inv_param.cpu_prec, gauge_param);
+      break;
+    case 2:
+      clover_mat(spinorRef->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, dagger, inv_param.cpu_prec, gauge_param);
+      break;
+    case 3:
+      clover_matpc(spinorTmp->V(), hostGauge, hostClover, hostCloverInv, spinor->V(), inv_param.kappa, inv_param.matpc_type,
+		   dagger, inv_param.cpu_prec, gauge_param);
+      clover_matpc(spinorRef->V(), hostGauge, hostClover, hostCloverInv, spinorTmp->V(), inv_param.kappa, inv_param.matpc_type,
+		   not_dagger, inv_param.cpu_prec, gauge_param);
+      break;
+    case 4:
+      clover_mat(spinorTmp->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, dagger, inv_param.cpu_prec, gauge_param);
+      clover_mat(spinorRef->V(), hostGauge, hostClover, spinorTmp->V(), inv_param.kappa, not_dagger,
+		 inv_param.cpu_prec, gauge_param);
+      break;
+    default:
+      printfQuda("Test type not defined\n");
+      exit(-1);
+    }
+  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
     switch (test_type) {
     case 0:
       if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS)
@@ -697,7 +724,7 @@ void dslashRef() {
       }
       break;
     case 3:    
-      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS){      
+      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { 
 	tm_matpc(spinorTmp->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
 	       inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param);
 	tm_matpc(spinorRef->V(), hostGauge, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
@@ -721,7 +748,7 @@ void dslashRef() {
       }
       break;
     case 4:
-      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS){      
+      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) {      
 	tm_mat(spinorTmp->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
 	     dagger, inv_param.cpu_prec, gauge_param);
 	tm_mat(spinorRef->V(), hostGauge, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
@@ -748,6 +775,46 @@ void dslashRef() {
       printfQuda("Test type not defined\n");
       exit(-1);
     }
+  } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    switch (test_type) {
+    case 0:
+      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS)
+	tmc_dslash(spinorRef->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param);
+      else
+        errorQuda("Not supported\n");
+      break;
+    case 1:
+      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS)      
+	tmc_matpc(spinorRef->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param);
+      else
+        errorQuda("Not supported\n");
+      break;
+    case 2:
+      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS)      
+	tmc_mat(spinorRef->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, dagger, inv_param.cpu_prec, gauge_param);
+      else
+        errorQuda("Not supported\n");
+      break;
+    case 3:    
+      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) {
+	tmc_matpc(spinorTmp->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+	       inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param);
+	tmc_matpc(spinorRef->V(), hostGauge, spinorTmp->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+	       inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
+      } else
+        errorQuda("Not supported\n");
+      break;
+    case 4:
+      if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) {
+	tmc_mat(spinorTmp->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, dagger, inv_param.cpu_prec, gauge_param);
+	tmc_mat(spinorRef->V(), hostGauge, hostClover, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, not_dagger, inv_param.cpu_prec, gauge_param);
+      } else
+        errorQuda("Not supported\n");
+      break;
+    default:
+      printfQuda("Test type not defined\n");
+      exit(-1);
+    }
   } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ){
     switch (test_type) {
     case 0:
@@ -907,8 +974,6 @@ int main(int argc, char **argv)
   for (int i=0; i<attempts; i++) {
 
     if (tune) { // warm-up run
-      printfQuda("Tuning...\n");
-      setTuning(QUDA_TUNE_YES);
       dslashCUDA(1);
     }
     printfQuda("Executing %d kernel loops...\n", niter);
diff --git a/tests/dslash_util.h b/tests/dslash_util.h
index 821535beac..a848a8a559 100644
--- a/tests/dslash_util.h
+++ b/tests/dslash_util.h
@@ -22,12 +22,6 @@ static inline void ax(Float *dst, Float a, Float *x, int cnt) {
     dst[i] = a * x[i];
 }
 
-// performs the operation y[i] = x[i] + a*y[i]
-template <typename Float>
-static inline void xpay(Float *x, Float a, Float *y, int len) {
-  for (int i=0; i<len; i++) y[i] = x[i] + a*y[i];
-}
-
 // performs the operation y[i] = a*x[i] + y[i]
 template <typename Float>
 static inline void axpy(Float a, Float *x, Float *y, int len) {
diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp
index 9f239ba77e..2eabfe7587 100644
--- a/tests/invert_test.cpp
+++ b/tests/invert_test.cpp
@@ -57,6 +57,9 @@ extern double tol_hq; // heavy-quark tolerance for inverter
 extern QudaMassNormalization normalization; // mass normalization of Dirac operators
 extern QudaMatPCType matpc_type; // preconditioning type
 
+extern double clover_coeff;
+extern bool compute_clover;
+
 extern int niter; // max solver iterations
 extern char latfile[];
 
@@ -184,12 +187,14 @@ int main(int argc, char **argv)
   for (int i=0; i<inv_param.num_offset; i++) inv_param.offset[i] = offset[i];
 
   inv_param.inv_type = inv_type;
-  if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH ||
-      dslash_type == QUDA_DOMAIN_WALL_DSLASH  || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
-      dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
+  if (multishift) {
+    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
+  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH ||
+	     dslash_type == QUDA_DOMAIN_WALL_DSLASH  || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
+	     dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
     inv_param.solution_type = QUDA_MAT_SOLUTION;
   } else {
-    inv_param.solution_type = multishift ? QUDA_MATPCDAG_MATPC_SOLUTION : QUDA_MATPC_SOLUTION;
+    inv_param.solution_type = QUDA_MATPC_SOLUTION;
   }
   inv_param.matpc_type = matpc_type;
 
@@ -280,7 +285,7 @@ int main(int argc, char **argv)
     inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
     inv_param.clover_cuda_prec_precondition = cuda_prec_precondition;
     inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
-    inv_param.clover_coeff = 1.5*inv_param.kappa;
+    inv_param.clover_coeff = clover_coeff;
   }
 
   inv_param.verbosity = QUDA_VERBOSE;
@@ -302,7 +307,7 @@ int main(int argc, char **argv)
   size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
   size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
 
-  void *gauge[4], *clover_inv=0, *clover=0;
+  void *gauge[4], *clover=0, *clover_inv=0;
 
   for (int dir = 0; dir < 4; dir++) {
     gauge[dir] = malloc(V*gaugeSiteSize*gSize);
@@ -315,29 +320,19 @@ int main(int argc, char **argv)
     construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param);
   }
 
-  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
-    double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    double norm = 0.1; // clover components are random numbers in the range (-norm, norm)
     double diag = 1.0; // constant added to the diagonal
 
-    size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
+    size_t cSize = inv_param.clover_cpu_prec;
+    clover = malloc(V*cloverSiteSize*cSize);
     clover_inv = malloc(V*cloverSiteSize*cSize);
-    construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec);
-
-    // The uninverted clover term is only needed when solving the unpreconditioned
-    // system or when using "asymmetric" even/odd preconditioning.
-    int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE ||
-			  inv_param.solve_type == QUDA_NORMOP_PC_SOLVE);
-    int asymmetric = preconditioned &&
-                         (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
-                          inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);
-    if (!preconditioned) {
-      clover = clover_inv;
-      clover_inv = NULL;
-    } else if (asymmetric) { // fake it by using the same random matrix
-      clover = clover_inv;   // for both clover and clover_inv
-    } else {
-      clover = NULL;
-    }
+    if (!compute_clover) construct_clover_field(clover, norm, diag, inv_param.clover_cpu_prec);
+
+    inv_param.compute_clover = compute_clover;
+    if (compute_clover) inv_param.return_clover = 1;
+    inv_param.compute_clover_inverse = 1;
+    inv_param.return_clover_inverse = 1;
   }
 
   void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
@@ -379,13 +374,14 @@ int main(int argc, char **argv)
   // initialize the QUDA library
   initQuda(device);
 
+  setTuning(QUDA_TUNE_YES);
+
   // load the gauge field
   loadGaugeQuda((void*)gauge, &gauge_param);
 
   // load the clover term, if desired
-  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param);
-
-  if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(NULL, NULL, &inv_param);
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH)
+    loadCloverQuda(clover, clover_inv, &inv_param);
 
   // perform the inversion
   if (multishift) {
@@ -417,18 +413,30 @@ int main(int argc, char **argv)
     for(int i=0; i < inv_param.num_offset; i++) {
       ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
       
-      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+      if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
 	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
 	  errorQuda("Twisted mass solution type not supported");
         tm_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
         tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                  inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
-      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+      } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
+	  errorQuda("Twisted mass solution type not supported");
+	tmc_matpc(spinorTmp, gauge, spinorOutMulti[i], clover, clover_inv, inv_param.kappa, inv_param.mu,
+		  inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
+        tmc_matpc(spinorCheck, gauge, spinorTmp, clover, clover_inv, inv_param.kappa, inv_param.mu,
+		  inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
+      } else if (dslash_type == QUDA_WILSON_DSLASH) {
         wil_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0,
                   inv_param.cpu_prec, gauge_param);
         wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1,
                   inv_param.cpu_prec, gauge_param);
+      } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+        clover_matpc(spinorTmp, gauge, clover, clover_inv, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0,
+		     inv_param.cpu_prec, gauge_param);
+        clover_matpc(spinorCheck, gauge, clover, clover_inv, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1,
+		     inv_param.cpu_prec, gauge_param);
       } else {
         printfQuda("Domain wall not supported for multi-shift\n");
         exit(-1);
@@ -450,22 +458,26 @@ int main(int argc, char **argv)
     
     if (inv_param.solution_type == QUDA_MAT_SOLUTION) {
 
-      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-	if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS)      
+      if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
+	if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) {
 	  tm_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 0, inv_param.cpu_prec, gauge_param);
-	else
-	{
+	} else {
           int tm_offset = V*spinorSiteSize; //12*spinorRef->Volume(); 	  
 	  void *evenOut = spinorCheck;
 	  void *oddOut  = cpu_prec == sizeof(double) ? (void*)((double*)evenOut + tm_offset): (void*)((float*)evenOut + tm_offset);
-    
+	  
 	  void *evenIn  = spinorOut;
 	  void *oddIn   = cpu_prec == sizeof(double) ? (void*)((double*)evenIn + tm_offset): (void*)((float*)evenIn + tm_offset);
-    
+	  
 	  tm_ndeg_mat(evenOut, oddOut, gauge, evenIn, oddIn, inv_param.kappa, inv_param.mu, inv_param.epsilon, 0, inv_param.cpu_prec, gauge_param);	
 	}
-      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+      } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+	tmc_mat(spinorCheck, gauge, clover, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 0,
+		inv_param.cpu_prec, gauge_param);
+      } else if (dslash_type == QUDA_WILSON_DSLASH) {
         wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param);
+      } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+        clover_mat(spinorCheck, gauge, clover, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param);
       } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
         dw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass);
       } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) {
@@ -499,14 +511,22 @@ int main(int argc, char **argv)
 
     } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) {
 
-      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+      if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
 	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
 	  errorQuda("Twisted mass solution type not supported");
         tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
-      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
-        wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, 
+      } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
+	  errorQuda("Twisted mass solution type not supported");
+        tmc_matpc(spinorCheck, gauge, spinorOut, clover, clover_inv, inv_param.kappa, inv_param.mu,
+		  inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
+      } else if (dslash_type == QUDA_WILSON_DSLASH) {
+        wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0,
                   inv_param.cpu_prec, gauge_param);
+      } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+        clover_matpc(spinorCheck, gauge, clover, clover_inv, spinorOut, inv_param.kappa, inv_param.matpc_type, 0,
+		     inv_param.cpu_prec, gauge_param);
       } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
         dw_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass);
       } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) {
@@ -544,18 +564,30 @@ int main(int argc, char **argv)
 
       ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
       
-      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+      if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
 	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
 	  errorQuda("Twisted mass solution type not supported");
         tm_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
         tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 
                  inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
-      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+      } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+	if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
+	  errorQuda("Twisted mass solution type not supported");
+        tmc_matpc(spinorTmp, gauge, spinorOut, clover, clover_inv, inv_param.kappa, inv_param.mu,
+		  inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
+        tmc_matpc(spinorCheck, gauge, spinorTmp, clover, clover_inv, inv_param.kappa, inv_param.mu,
+		  inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
+      } else if (dslash_type == QUDA_WILSON_DSLASH) {
         wil_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0,
                   inv_param.cpu_prec, gauge_param);
         wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1,
                   inv_param.cpu_prec, gauge_param);
+      } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+        clover_matpc(spinorTmp, gauge, clover, clover_inv, spinorOut, inv_param.kappa,
+		     inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
+        clover_matpc(spinorCheck, gauge, clover, clover_inv, spinorTmp, inv_param.kappa,
+		     inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
       } else {
         errorQuda("Unsupported dslash_type");
       }
@@ -587,5 +619,12 @@ int main(int argc, char **argv)
 
   finalizeComms();
 
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    if (clover) free(clover);
+    if (clover_inv) free(clover_inv);
+  }
+
+  for (int dir = 0; dir<4; dir++) free(gauge[dir]);
+
   return 0;
 }
diff --git a/tests/multigrid_invert_test.cpp b/tests/multigrid_invert_test.cpp
index a3b88d4c11..8b842ec10f 100644
--- a/tests/multigrid_invert_test.cpp
+++ b/tests/multigrid_invert_test.cpp
@@ -72,7 +72,8 @@ extern QudaTwistFlavorType twist_flavor;
 
 extern void usage(char** );
 
-double clover_coeff = 1.0;
+extern double clover_coeff;
+extern bool compute_clover;
 
 namespace quda {
   extern void setTransferGPU(bool);
@@ -171,6 +172,7 @@ void setMultigridParam(QudaMultigridParam &mg_param) {
     inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
     inv_param.clover_cuda_prec_precondition = cuda_prec_precondition;
     inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
+    inv_param.clover_coeff = clover_coeff;
   }
 
   inv_param.input_location = QUDA_CPU_FIELD_LOCATION;
@@ -195,8 +197,6 @@ void setMultigridParam(QudaMultigridParam &mg_param) {
     }
   }
 
-  inv_param.clover_coeff = clover_coeff;
-
   inv_param.dagger = QUDA_DAG_NO;
   inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
 
@@ -415,7 +415,7 @@ int main(int argc, char **argv)
   size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
   size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
 
-  void *gauge[4], *clover_inv=0;//, *clover=0;
+  void *gauge[4], *clover=0, *clover_inv=0;
 
   for (int dir = 0; dir < 4; dir++) {
     gauge[dir] = malloc(V*gaugeSiteSize*gSize);
@@ -436,25 +436,15 @@ int main(int argc, char **argv)
     double norm = 0.1; // clover components are random numbers in the range (-norm, norm)
     double diag = 1.0; // constant added to the diagonal
 
-    size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
+    size_t cSize = inv_param.clover_cpu_prec;
+    clover = malloc(V*cloverSiteSize*cSize);
     clover_inv = malloc(V*cloverSiteSize*cSize);
-    construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec);
-
-    // The uninverted clover term is only needed when solving the unpreconditioned
-    // system or when using "asymmetric" even/odd preconditioning.
-    int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE ||
-			  inv_param.solve_type == QUDA_NORMOP_PC_SOLVE);
-    int asymmetric = preconditioned &&
-                         (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
-                          inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);
-    if (!preconditioned) {
-      //clover = clover_inv;
-      clover_inv = NULL;
-    } else if (asymmetric) { // fake it by using the same random matrix
-      //clover = clover_inv;   // for both clover and clover_inv
-    } else {
-      //clover = NULL;
-    }
+    if (!compute_clover) construct_clover_field(clover, norm, diag, inv_param.clover_cpu_prec);
+
+    inv_param.compute_clover = compute_clover;
+    if (compute_clover) inv_param.return_clover = 1;
+    inv_param.compute_clover_inverse = 1;
+    inv_param.return_clover_inverse = 1;
   }
 
   void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
@@ -472,12 +462,10 @@ int main(int argc, char **argv)
   // load the gauge field
   loadGaugeQuda((void*)gauge, &gauge_param);
 
-  // load the clover term, if desired
-  //if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param);
-
   // this line ensure that if we need to construct the clover inverse (in either the smoother or the solver) we do so
   if (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE || solve_type == QUDA_DIRECT_PC_SOLVE) inv_param.solve_type = QUDA_DIRECT_PC_SOLVE;
-  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(NULL, NULL, &inv_param);
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param);
+
   inv_param.solve_type = solve_type; // restore actual solve_type we want to do
 
   // setup the multigrid solver
@@ -577,11 +565,14 @@ int main(int argc, char **argv)
   endQuda();
 
   // finalize the communications layer
-#if defined(QMP_COMMS)
-  QMP_finalize_msg_passing();
-#elif defined(MPI_COMMS)
-  MPI_Finalize();
-#endif
+  finalizeComms();
+
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    if (clover) free(clover);
+    if (clover_inv) free(clover_inv);
+  }
+
+  for (int dir = 0; dir<4; dir++) free(gauge[dir]);
 
   return 0;
 }
diff --git a/tests/staggered_dslash_reference.cpp b/tests/staggered_dslash_reference.cpp
index 6932e07c7b..ea845cb339 100644
--- a/tests/staggered_dslash_reference.cpp
+++ b/tests/staggered_dslash_reference.cpp
@@ -12,6 +12,7 @@
 #include <blas_quda.h>
 
 #include <face_quda.h>
+#include <blas_reference.h>
 
 extern void *memset(void *s, int c, size_t n);
 
@@ -126,10 +127,7 @@ void Mat(sFloat *out, gFloat **fatlink, gFloat** longlink, sFloat *in, sFloat ka
   // full dslash operator
   dslashReference(outOdd, fatlink, longlink, inEven, 1, daggerBit);
   dslashReference(outEven, fatlink, longlink, inOdd, 0, daggerBit);
-    
-  // lastly apply the kappa term
-  xpay(in, -kappa, out, V*mySpinorSiteSize);
-}
+}
 
 
 void 
@@ -150,6 +148,9 @@ mat(void *out, void **fatlink, void** longlink, void *in, double kappa, int dagg
       Mat((float*)out, (float**)fatlink, (float**)longlink, (float*)in, (float)kappa, dagger_bit);
     }
   }
+
+  // lastly apply the kappa term
+  xpay(in, -kappa, out, V*mySpinorSiteSize, sPrecision);
 }
 
 
@@ -220,27 +221,19 @@ matdagmat(void *out, void **fatlink, void** longlink, void *in, double mass, int
 
 // Apply the even-odd preconditioned Dirac operator
 template <typename sFloat, typename gFloat>
-static void MatPC(sFloat *outEven, gFloat **fatlink, gFloat** longlink, sFloat *inEven, sFloat kappa, 
-		  int daggerBit, QudaMatPCType matpc_type) {
+static void MatPC(sFloat *outEven, gFloat **fatlink, gFloat** longlink, sFloat *inEven, int dagger, QudaMatPCType matpc_type) {
     
   sFloat *tmp = (sFloat*)malloc(Vh*mySpinorSiteSize*sizeof(sFloat));
     
   // full dslash operator
   if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
-    dslashReference(tmp, fatlink, longlink, inEven, 1, daggerBit);
-    dslashReference(outEven, fatlink, longlink, tmp, 0, daggerBit);
-
-    //dslashReference(outEven, fatlink, longlink, inEven, 1, daggerBit);
+    dslashReference(tmp, fatlink, longlink, inEven, 1, dagger);
+    dslashReference(outEven, fatlink, longlink, tmp, 0, dagger);
   } else {
-    dslashReference(tmp, fatlink, longlink, inEven, 0, daggerBit);
-    dslashReference(outEven, fatlink, longlink, tmp, 1, daggerBit);
+    dslashReference(tmp, fatlink, longlink, inEven, 0, dagger);
+    dslashReference(outEven, fatlink, longlink, tmp, 1, dagger);
   }    
   
-  // lastly apply the kappa term
-    
-  sFloat kappa2 = -kappa*kappa;
-  xpay(inEven, kappa2, outEven, Vh*mySpinorSiteSize);
-    
   free(tmp);
 }
 
@@ -252,18 +245,22 @@ staggered_matpc(void *outEven, void **fatlink, void**longlink, void *inEven, dou
     
   if (sPrecision == QUDA_DOUBLE_PRECISION)
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, (double)kappa, dagger_bit, matpc_type);
+      MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, dagger_bit, matpc_type);
     }
     else{
-      MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, (double)kappa, dagger_bit, matpc_type);
+      MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, dagger_bit, matpc_type);
     }
   else {
     if (gPrecision == QUDA_DOUBLE_PRECISION){ 
-      MatPC((float*)outEven, (double**)fatlink, (double**)longlink, (float*)inEven, (float)kappa, dagger_bit, matpc_type);
+      MatPC((float*)outEven, (double**)fatlink, (double**)longlink, (float*)inEven, dagger_bit, matpc_type);
     }else{
-      MatPC((float*)outEven, (float**)fatlink, (float**)longlink, (float*)inEven, (float)kappa, dagger_bit, matpc_type);
+      MatPC((float*)outEven, (float**)fatlink, (float**)longlink, (float*)inEven, dagger_bit, matpc_type);
     }
   }
+
+  // lastly apply the kappa term
+  double kappa2 = -kappa*kappa;
+  xpay(inEven, kappa2, outEven, Vh*mySpinorSiteSize, sPrecision);
 }
 
 #ifdef MULTI_GPU
diff --git a/tests/test_util.cpp b/tests/test_util.cpp
index 38c5e3cda5..44982bef69 100644
--- a/tests/test_util.cpp
+++ b/tests/test_util.cpp
@@ -1590,6 +1590,8 @@ bool verify_results = true;
 double mass = 0.1;
 double mu = 0.1;
 double anisotropy = 1.0;
+double clover_coeff = 0.1;
+bool compute_clover = false;
 double tol = 1e-7;
 double tol_hq = 0.;
 QudaTwistFlavorType twist_flavor = QUDA_TWIST_MINUS;
@@ -1656,6 +1658,8 @@ void usage(char** argv )
   printf("    --multishift <true/false>                 # Whether to do a multi-shift solver test or not (default false)\n");     
   printf("    --mass                                    # Mass of Dirac operator (default 0.1)\n");
   printf("    --mu                                      # Twisted-Mass of Dirac operator (default 0.1)\n");
+  printf("    --compute-clover <true/false>             # Compute the clover field or use random numbers (default false)\n");
+  printf("    --clover-coeff                            # Clover coefficient (default 0.1)\n");
   printf("    --anisotropy                              # Temporal anisotropy factor (default 1.0)\n");
   printf("    --mass-normalization                      # Mass normalization (kappa (default) / mass / asym-mass)\n");
   printf("    --matpc                                   # Matrix preconditioning type (even-even, odd-odd, even-even-asym, odd-odd-asym) \n");
@@ -2150,6 +2154,34 @@ int process_command_line_option(int argc, char** argv, int* idx)
     goto out;
   }
 
+  if( strcmp(argv[i], "--compute-clover") == 0){
+    if (i+1 >= argc){
+      usage(argv);
+    }
+    if (strcmp(argv[i+1], "true") == 0){
+      compute_clover = true;
+    }else if (strcmp(argv[i+1], "false") == 0){
+      compute_clover = false;
+    }else{
+      fprintf(stderr, "ERROR: invalid compute_clover type\n");
+      exit(1);
+    }
+
+    i++;
+    ret = 0;
+    goto out;
+  }
+
+  if( strcmp(argv[i], "--clover-coeff") == 0){
+    if (i+1 >= argc){
+      usage(argv);
+    }
+    clover_coeff = atof(argv[i+1]);
+    i++;
+    ret = 0;
+    goto out;
+  }
+
   if( strcmp(argv[i], "--mu") == 0){
     if (i+1 >= argc){
       usage(argv);
diff --git a/tests/wilson_dslash_reference.cpp b/tests/wilson_dslash_reference.cpp
index 752411a7b5..8d9c3baa87 100644
--- a/tests/wilson_dslash_reference.cpp
+++ b/tests/wilson_dslash_reference.cpp
@@ -4,6 +4,7 @@
 
 #include <util_quda.h>
 
+
 #include <test_util.h>
 #include <blas_reference.h>
 #include <wilson_dslash_reference.h>
@@ -303,8 +304,7 @@ void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, Qu
   wil_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param);
 
   // lastly apply the kappa term
-  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V*spinorSiteSize);
-  else xpay((float*)in, -(float)kappa, (float*)out, V*spinorSiteSize);
+  xpay(in, -kappa, out, V*spinorSiteSize, precision);
 }
 
 void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, 
@@ -324,8 +324,7 @@ void tm_mat(void *out, void **gauge, void *in, double kappa, double mu,
   twist_gamma5(tmp, in, dagger_bit, kappa, mu, flavor, V, QUDA_TWIST_GAMMA5_DIRECT, precision);
 
   // combine
-  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, -kappa, (double*)out, V*spinorSiteSize);
-  else xpay((float*)tmp, -(float)kappa, (float*)out, V*spinorSiteSize);
+  xpay(tmp, -kappa, out, V*spinorSiteSize, precision);
 
   free(tmp);
 }
@@ -349,8 +348,7 @@ void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa,
   
   // lastly apply the kappa term
   double kappa2 = -kappa*kappa;
-  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize);
-  else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize);
+  xpay(inEven, kappa2, outEven, Vh*spinorSiteSize, precision);
 
   free(tmp);
 }
@@ -401,11 +399,9 @@ void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu
   // lastly apply the kappa term
   double kappa2 = -kappa*kappa;
   if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD) {
-    if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize);
-    else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize);
+    xpay(inEven, kappa2, outEven, Vh*spinorSiteSize, precision);
   } else {
-    if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, kappa2, (double*)outEven, Vh*spinorSiteSize);
-    else xpay((float*)tmp, (float)kappa2, (float*)outEven, Vh*spinorSiteSize);
+    xpay(tmp, kappa2, outEven, Vh*spinorSiteSize, precision);
   }
 
   free(tmp);
@@ -419,7 +415,7 @@ void ndegTwistGamma5(sFloat *out1, sFloat *out2, sFloat *in1, sFloat *in2, const
 
   sFloat a=0.0, b=0.0, d=0.0;
   if (twist == QUDA_TWIST_GAMMA5_DIRECT) { // applying the twist
-    a = 2.0 * kappa * mu; 
+    a = 2.0 * kappa * mu;
     b = -2.0 * kappa * epsilon;
     d = 1.0;
   } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) { // applying the inverse twist
@@ -485,66 +481,60 @@ void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1,
   
   if (!daggerBit) {
     if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
-      wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param);      
-      wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param);            
+      wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param);
+      wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param);
       ndeg_twist_gamma5(tmp1, tmp2,  tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
-      wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param);      
-      wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param);       
+      wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param);
+      wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param);
       ndeg_twist_gamma5(outEven1, outEven2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
     } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
-      wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param);      
-      wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param);            
+      wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param);
+      wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param);
       ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
-      wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param);      
-      wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param);       
+      wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param);
+      wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param);
       ndeg_twist_gamma5(outEven1, outEven2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
     }
   } else {
     if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
       ndeg_twist_gamma5(tmp1, tmp2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
-      wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param);      
-      wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param);      
+      wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param);
+      wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param);
       ndeg_twist_gamma5(tmp1, tmp2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
-      wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param);      
-      wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param);      
+      wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param);
+      wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param);
     } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
       ndeg_twist_gamma5(tmp1, tmp2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
-      wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param);      
-      wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param);      
+      wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param);
+      wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param);
       ndeg_twist_gamma5(tmp1, tmp2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
-      wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param);      
-      wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param);      
+      wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param);
+      wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param);
     }
   }
   
   if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
-      wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param);      
-      wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param);            
+      wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param);
+      wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param);
       ndeg_twist_gamma5(tmp1, tmp2,  tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
-      wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param);      
-      wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param);      
+      wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param);
+      wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param);
   } else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
-      wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param);      
-      wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param);            
+      wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param);
+      wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param);
       ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
-      wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param);      
-      wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param);      
+      wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param);
+      wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param);
   }  
   
   // lastly apply the kappa term
   double kappa2 = -kappa*kappa;
   if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
-    ndeg_twist_gamma5(inEven1, inEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); 
+    ndeg_twist_gamma5(inEven1, inEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision);
   }
 
-  if (precision == QUDA_DOUBLE_PRECISION){
-    xpay((double*)inEven1, kappa2, (double*)outEven1, Vh*spinorSiteSize);
-    xpay((double*)inEven2, kappa2, (double*)outEven2, Vh*spinorSiteSize);  
-  }
-  else{
-    xpay((float*)inEven1, (float)kappa2, (float*)outEven1, Vh*spinorSiteSize);
-    xpay((float*)inEven2, (float)kappa2, (float*)outEven2, Vh*spinorSiteSize);  
-  }
+  xpay(inEven1, kappa2, outEven1, Vh*spinorSiteSize, precision);
+  xpay(inEven2, kappa2, outEven2, Vh*spinorSiteSize, precision);
 
   free(tmp1);
   free(tmp2);
@@ -555,48 +545,39 @@ void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void *
 {
   //V-4d volume and Vh=V/2
   void *inEven1   = evenIn;
-  void *inEven2   = (char*)evenIn + precision*Vh*spinorSiteSize;  
+  void *inEven2   = (char*)evenIn + precision*Vh*spinorSiteSize;
   
   void *inOdd1    = oddIn;
-  void *inOdd2    = (char*)oddIn + precision*Vh*spinorSiteSize;  
+  void *inOdd2    = (char*)oddIn + precision*Vh*spinorSiteSize;
   
   void *outEven1  = evenOut;
-  void *outEven2  = (char*)evenOut + precision*Vh*spinorSiteSize;  
+  void *outEven2  = (char*)evenOut + precision*Vh*spinorSiteSize;
   
   void *outOdd1   = oddOut;
-  void *outOdd2   = (char*)oddOut + precision*Vh*spinorSiteSize;  
+  void *outOdd2   = (char*)oddOut + precision*Vh*spinorSiteSize;
  
   void *tmpEven1 = malloc(Vh*spinorSiteSize*precision);
   void *tmpEven2 = malloc(Vh*spinorSiteSize*precision);
 
   void *tmpOdd1  = malloc(Vh*spinorSiteSize*precision);
-  void *tmpOdd2  = malloc(Vh*spinorSiteSize*precision);  
+  void *tmpOdd2  = malloc(Vh*spinorSiteSize*precision);
   
   // full dslash operator:
-  wil_dslash(outOdd1, gauge, inEven1, 1, daggerBit, precision, gauge_param);        
-  wil_dslash(outOdd2, gauge, inEven2, 1, daggerBit, precision, gauge_param);          
+  wil_dslash(outOdd1, gauge, inEven1, 1, daggerBit, precision, gauge_param);
+  wil_dslash(outOdd2, gauge, inEven2, 1, daggerBit, precision, gauge_param);
 
-  wil_dslash(outEven1, gauge, inOdd1, 0, daggerBit, precision, gauge_param);        
-  wil_dslash(outEven2, gauge, inOdd2, 0, daggerBit, precision, gauge_param);          
+  wil_dslash(outEven1, gauge, inOdd1, 0, daggerBit, precision, gauge_param);
+  wil_dslash(outEven2, gauge, inOdd2, 0, daggerBit, precision, gauge_param);
 
   // apply the twist term
-  ndeg_twist_gamma5(tmpEven1, tmpEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision);    
+  ndeg_twist_gamma5(tmpEven1, tmpEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision);
   ndeg_twist_gamma5(tmpOdd1, tmpOdd2, inOdd1, inOdd2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision);
   // combine
-  if (precision == QUDA_DOUBLE_PRECISION){
-    xpay((double*)tmpOdd1, -kappa, (double*)outOdd1, Vh*spinorSiteSize);
-    xpay((double*)tmpOdd2, -kappa, (double*)outOdd2, Vh*spinorSiteSize);  
-   
-    xpay((double*)tmpEven1, -kappa, (double*)outEven1, Vh*spinorSiteSize);
-    xpay((double*)tmpEven2, -kappa, (double*)outEven2, Vh*spinorSiteSize);  
-  }
-  else{
-    xpay((float*)tmpOdd1, (float)(-kappa), (float*)outOdd1, Vh*spinorSiteSize);
-    xpay((float*)tmpOdd2, (float)(-kappa), (float*)outOdd2, Vh*spinorSiteSize);  
-    
-    xpay((float*)tmpEven1, (float)(-kappa), (float*)outEven1, Vh*spinorSiteSize);
-    xpay((float*)tmpEven2, (float)(-kappa), (float*)outEven2, Vh*spinorSiteSize);  
-  }
+  xpay(tmpOdd1, -kappa, outOdd1, Vh*spinorSiteSize, precision);
+  xpay(tmpOdd2, -kappa, outOdd2, Vh*spinorSiteSize, precision);
+
+  xpay(tmpEven1, -kappa, outEven1, Vh*spinorSiteSize, precision);
+  xpay(tmpEven2, -kappa, outEven2, Vh*spinorSiteSize, precision);
 
   free(tmpOdd1);
   free(tmpOdd2);
diff --git a/tests/wilson_dslash_reference.h b/tests/wilson_dslash_reference.h
index e066613f05..0dc929a299 100644
--- a/tests/wilson_dslash_reference.h
+++ b/tests/wilson_dslash_reference.h
@@ -10,7 +10,7 @@ extern "C" {
 
   void wil_dslash(void *res, void **gauge, void *spinorField, int oddBit,
 		  int daggerBit, QudaPrecision precision, QudaGaugeParam &param);
-  
+
   void wil_mat(void *out, void **gauge, void *in, double kappa, int daggerBit,
 	       QudaPrecision precision, QudaGaugeParam &param);
 
@@ -20,22 +20,43 @@ extern "C" {
   void tm_dslash(void *res, void **gauge, void *spinorField, double kappa,
 		 double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type,
 		 int daggerBit, QudaPrecision sprecision, QudaGaugeParam &param);
-  
+
   void tm_mat(void *out, void **gauge, void *in, double kappa, double mu,
 	      QudaTwistFlavorType flavor, int daggerBit, QudaPrecision precision, QudaGaugeParam &param);
 
   void tm_matpc(void *out, void **gauge, void *in, double kappa, double mu,
-		QudaTwistFlavorType flavor, QudaMatPCType matpc_type,  
+		QudaTwistFlavorType flavor, QudaMatPCType matpc_type,
 		int daggerBit, QudaPrecision precision, QudaGaugeParam &param);
 
-  void tm_ndeg_dslash(void *res1, void *res2, void **gaugeFull, void *spinorField1, void *spinorField2, 
-		      double kappa, double mu,  double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, 
-		      QudaPrecision precision, QudaGaugeParam &gauge_param) ;
+  void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa,
+		 double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type,
+		 int daggerBit, QudaPrecision sprecision, QudaGaugeParam &param);
+
+  void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu,
+	       QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param);
+
+  void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor,
+                 QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param);
+
+  void tm_ndeg_dslash(void *res1, void *res2, void **gaugeFull, void *spinorField1, void *spinorField2,
+		      double kappa, double mu,  double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type,
+		      QudaPrecision precision, QudaGaugeParam &gauge_param);
   void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, void *inEven2, double kappa, double mu, double epsilon,
-	   QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param);
-		      
-  void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void *oddIn,  
-		   double kappa, double mu, double epsilon, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param);		      
+		     QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param);
+
+  void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void *oddIn,
+		   double kappa, double mu, double epsilon, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param);
+
+  void apply_clover(void *out, void *clover, void *in, int parity, QudaPrecision precision);
+
+  void clover_dslash(void *res, void **gauge, void *clover, void *spinorField, int oddBit,
+		     int daggerBit, QudaPrecision precision, QudaGaugeParam &param);
+
+  void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa,
+		  int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param);
+
+  void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa,
+		    QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param);
 
 #ifdef __cplusplus
 }