lattice · mathiaswagner · Jun 24, 2016 · Jun 4, 2016 · Jun 4, 2016 · Jun 10, 2016
@@ -25,6 +25,16 @@ namespace quda {
       order = (precision == QUDA_DOUBLE_PRECISION) ? 
 	QUDA_FLOAT2_CLOVER_ORDER : QUDA_FLOAT4_CLOVER_ORDER;
     }
+
+    CloverFieldParam() :  LatticeFieldParam(),
+      direct(true), inverse(true), clover(nullptr), norm(nullptr),
+      cloverInv(nullptr), invNorm(nullptr), twisted(false), mu2(0.0) { }
+
+    CloverFieldParam(const CloverFieldParam &param) :  LatticeFieldParam(param),
+      direct(param.direct), inverse(param.inverse),
+      clover(param.clover), norm(param.norm),
+      cloverInv(param.cloverInv), invNorm(param.invNorm),
+      twisted(param.twisted), mu2(param.mu2) { }
   };
 
   std::ostream& operator<<(std::ostream& output, const CloverFieldParam& param);

@@ -144,7 +144,7 @@ namespace quda {
           complex<Float> tmp(a[parity][idx], a[parity][idx+1]);
           return tmp;
 	} else {
-	  // requesting upper triangular so return conjuate transpose
+	  // requesting upper triangular so return conjugate transpose
 	  return conj(operator()(parity,x,s_col,s_row,c_col,c_row) );
 	}
       }
@@ -252,65 +252,143 @@ namespace quda {
     template <typename Float, int length, int N>
       struct FloatNOrder {
 	typedef typename mapper<Float>::type RegType;
-	Float *clover[2];
-	float *norm[2];
+	typedef typename VectorType<Float,N>::type Vector;
+	static const int M=length/(N*2); // number of short vectors per chiral block
+	static const int block=length/2; // chiral block size
+
+	Float *clover;
+	float *norm;
+	size_t offset;
+	size_t norm_offset;
 	const int volumeCB;
 	const int stride;
 
 	const bool twisted;
 	const Float mu2;
-
-      FloatNOrder(const CloverField &clover, bool inverse, Float *clover_=0, float *norm_=0) : volumeCB(clover.VolumeCB()), stride(clover.Stride()),
-	  twisted(clover.Twisted()), mu2(clover.Mu2()) {
-	this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse));
-	this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2);
-	this->norm[0] = norm_ ? norm_ : (float*)(clover.Norm(inverse));
-	this->norm[1] = (float*)((char*)this->norm[0] + clover.NormBytes()/2);
-      }
+
+	size_t bytes;
+	size_t norm_bytes;
+	void *backup_h; //! host memory for backing up the field when tuning
+	void *backup_norm_h; //! host memory for backing up norm when tuning
+
+        FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, float *norm_=0) :
+	  offset(clover.Bytes()/(2*sizeof(Float))), norm_offset(clover.NormBytes()/(2*sizeof(float))),
+	  volumeCB(clover.VolumeCB()), stride(clover.Stride()),
+	  twisted(clover.Twisted()), mu2(clover.Mu2()), bytes(clover.Bytes()),
+	  norm_bytes(clover.NormBytes()), backup_h(nullptr), backup_norm_h(nullptr)
+	{
+	  this->clover = clover_ ? clover_ : (Float*)(clover.V(is_inverse));
+	  this->norm = norm_ ? norm_ : (float*)(clover.Norm(is_inverse));
+	}
 
 	bool  Twisted()	const	{return twisted;}
 	Float Mu2()	const	{return mu2;}
 
-	__device__ __host__ inline void load(RegType v[length], int x, int parity) const {
-	  const int M=length/(N*2);
-	  for (int chirality=0; chirality<2; chirality++) {
-	    for (int i=0; i<M; i++) {
-	      for (int j=0; j<N; j++) {
-		int intIdx = (chirality*M + i)*N + j; // internal dof index
-		int padIdx = intIdx / N;
-		copy(v[(chirality*M+i)*N+j], clover[parity][(padIdx*stride + x)*N + intIdx%N]);
-		if (sizeof(Float)==sizeof(short)) v[(chirality*M+i)*N+j] *= norm[parity][chirality*stride + x];
-	      }
-	    }
+	/**
+	   @brief Load accessor for a single chiral block
+	   @param[out] v Vector of loaded elements
+	   @param[in] x Checkerboarded site index
+	   @param[in] parity Field parity
+	   @param[in] chirality Chiral block index
+	 */
+	__device__ __host__ inline void load(RegType v[block], int x, int parity, int chirality) const {
+#pragma unroll
+	  for (int i=0; i<M; i++) {
+	    // first do vectorized copy from memory
+	    Vector vecTmp = vector_load<Vector>(clover + parity*offset, x + stride*(chirality*M+i));
+	    // second do scalar copy converting into register type
+#pragma unroll
+	    for (int j=0; j<N; j++) copy(v[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
 	  }
+
+	  if (sizeof(Float)==sizeof(short))
+#pragma unroll
+	    for (int i=0; i<block; i++) v[i] *= norm[parity*norm_offset + chirality*stride + x];
 	}
 
-	__device__ __host__ inline void save(const RegType v[length], int x, int parity) {
+	/**
+	   @brief Store accessor for a single chiral block
+	   @param[out] v Vector of elements to be stored
+	   @param[in] x Checkerboarded site index
+	   @param[in] parity Field parity
+	   @param[in] chirality Chiral block index
+	 */
+	__device__ __host__ inline void save(const RegType v[block], int x, int parity, int chirality) {
 	  // find the norm of each chiral block
-	  RegType scale[2];
+	  RegType scale = 0.0;
 	  if (sizeof(Float)==sizeof(short)) {
-	    const int M = length/2;
-	    for (int chi=0; chi<2; chi++) { // chirality
-	      scale[chi] = 0.0;
-	      for (int i=0; i<M; i++) 
-		scale[chi] = fabs(v[chi*M+i]) > scale[chi] ? fabs(v[chi*M+i]) : scale[chi];
-	      norm[parity][chi*stride + x] = scale[chi];
-	    }
+#pragma unroll
+	    for (int i=0; i<block; i++) scale = fabs(v[i]) > scale ? fabs(v[i]) : scale;
+	    norm[parity*norm_offset + chirality*stride + x] = scale;
 	  }
 
-	  const int M=length/(N*2);
-	  for (int chirality=0; chirality<2; chirality++) {
-	    for (int i=0; i<M; i++) {
-	      for (int j=0; j<N; j++) {
-		int intIdx = (chirality*M + i)*N + j;
-		int padIdx = intIdx / N;
-		if (sizeof(Float)==sizeof(short))
-		  copy(clover[parity][(padIdx*stride + x)*N + intIdx%N], v[(chirality*M+i)*N+j] / scale[chirality]);
-		else
-		  copy(clover[parity][(padIdx*stride + x)*N + intIdx%N], v[(chirality*M+i)*N+j]);
-	      }
-	    }
+#pragma unroll
+	  for (int i=0; i<M; i++) {
+	    Vector vecTmp;
+	    // first do scalar copy converting into storage type and rescaling if necessary
+	    if (sizeof(Float)==sizeof(short))
+#pragma unroll
+	      for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[i*N+j] / scale);
+	    else
+#pragma unroll
+	      for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[i*N+j]);
+
+	    // second do vectorized copy into memory
+	    reinterpret_cast<Vector*>(clover + parity*offset)[x + stride*(chirality*M+i)] = vecTmp;
+	  }
+	}
+
+	/**
+	   @brief Load accessor for the clover matrix
+	   @param[out] v Vector of loaded elements
+	   @param[in] x Checkerboarded site index
+	   @param[in] parity Field parity
+	   @param[in] chirality Chiral block index
+	 */
+	__device__ __host__ inline void load(RegType v[length], int x, int parity) const {
+#pragma unroll
+	  for (int chirality=0; chirality<2; chirality++) load(&v[chirality*36], x, parity, chirality);
+	}
+
+	/**
+	   @brief Store accessor for the clover matrix
+	   @param[out] v Vector of elements to be stored
+	   @param[in] x Checkerboarded site index
+	   @param[in] parity Field parity
+	   @param[in] chirality Chiral block index
+	 */
+	__device__ __host__ inline void save(const RegType v[length], int x, int parity) {
+#pragma unroll
+	  for (int chirality=0; chirality<2; chirality++) save(&v[chirality*36], x, parity, chirality);
+	}
+
+	/**
+	   @brief Backup the field to the host when tuning
+	*/
+	void save() {
+	  if (backup_h) errorQuda("Already allocated host backup");
+	  backup_h = safe_malloc(bytes);
+	  cudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost);
+	  if (norm_bytes) {
+	    backup_norm_h = safe_malloc(norm_bytes);
+	    cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
+	  }
+	  checkCudaError();
+	}
+
+	/**
+	   @brief Restore the field from the host after tuning
+	*/
+	void load() {
+	  cudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice);
+	  host_free(backup_h);
+	  backup_h = nullptr;
+	  if (norm_bytes) {
+	    cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice);
+	    host_free(backup_norm_h);
+	    backup_norm_h = nullptr;
 	  }
+	  checkCudaError();
 	}
 
 	size_t Bytes() const {
@@ -320,6 +398,7 @@ namespace quda {
 	}
       };
 
+
     /**
        QDP ordering for clover fields
     */

@@ -33,7 +33,6 @@ namespace quda {
     cudaGaugeField *fatGauge;  // used by staggered only
     cudaGaugeField *longGauge; // used by staggered only
     cudaCloverField *clover;
-    cudaCloverField *cloverInv;
 
     double mu; // used by twisted mass only
     double epsilon; //2nd tm parameter (used by twisted mass only)
@@ -49,7 +48,7 @@ namespace quda {
 
   DiracParam() 
     : type(QUDA_INVALID_DIRAC), kappa(0.0), m5(0.0), matpcType(QUDA_MATPC_INVALID),
-      dagger(QUDA_DAG_INVALID), gauge(0), clover(0), cloverInv(0), mu(0.0), epsilon(0.0),
+      dagger(QUDA_DAG_INVALID), gauge(0), clover(0), mu(0.0), epsilon(0.0),
       tmp1(0), tmp2(0)
     {
 
@@ -537,7 +536,6 @@ namespace quda {
     double mu;
     double epsilon;
     cudaCloverField &clover;
-    cudaCloverField &cloverInv;
     void checkParitySpinor(const ColorSpinorField &, const ColorSpinorField &) const;
     void twistedCloverApply(ColorSpinorField &out, const ColorSpinorField &in, 
           const QudaTwistGamma5Type twistType, const int parity) const;

@@ -329,7 +329,6 @@ namespace quda {
      @param T[in] Transfer operator that defines the coarse space
      @param gauge[in] Gauge field from fine grid
      @param clover[in] Clover field on fine grid (optional)
-     @param cloverInv[in] Inverse Clover field on fine grid (optional, only for twisted-clover)
      @param kappa[in] Kappa parameter
      @param mu[in] Mu parameter (set to non-zero for twisted-mass/twisted-clover)
      @param matpc[in] The type of even-odd preconditioned fine-grid
@@ -338,7 +337,7 @@ namespace quda {
      even-odd preconditioned and we coarsen the full operator.
    */
   void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T,
-		const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv,
+		const cudaGaugeField &gauge, const cudaCloverField *clover,
 		double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc);
 
   /**

@@ -183,6 +183,11 @@ extern "C" {
     int compute_clover_trlog;              /**< Whether to compute the trace log of the clover term */
     double trlogA[2];                      /**< The trace log of the clover term (even/odd computed separately) */
 
+    int compute_clover;                    /**< Whether to compute the clover field */
+    int compute_clover_inverse;            /**< Whether to compute the clover inverse field */
+    int return_clover;                     /**< Whether to copy back the clover matrix field */
+    int return_clover_inverse;             /**< Whether to copy back the inverted clover matrix field */
+
     QudaVerbosity verbosity;               /**< The verbosity setting to use in the solver */
 
     int sp_pad;                            /**< The padding to use for the fermion fields */

@@ -355,9 +355,18 @@ void printQudaInvertParam(QudaInvertParam *param) {
 #if defined INIT_PARAM
     P(clover_cuda_prec_precondition, QUDA_INVALID_PRECISION);
     P(compute_clover_trlog, 0);
+    P(compute_clover, 0);
+    P(compute_clover_inverse, 0);
+    P(return_clover, 0);
+    P(return_clover_inverse, 0);
 #else
-  if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION)
-    param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy;
+    if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION)
+      param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy;
+    P(compute_clover_trlog, QUDA_INVALID_PRECISION);
+    P(compute_clover, QUDA_INVALID_PRECISION);
+    P(compute_clover_inverse, QUDA_INVALID_PRECISION);
+    P(return_clover, QUDA_INVALID_PRECISION);
+    P(return_clover_inverse, QUDA_INVALID_PRECISION);
 #endif
     P(clover_order, QUDA_INVALID_CLOVER_ORDER);
     P(cl_pad, INVALID_INT);