diff --git a/include/clover_field.h b/include/clover_field.h index 6f16c99154..1c1f2067f4 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -25,6 +25,16 @@ namespace quda { order = (precision == QUDA_DOUBLE_PRECISION) ? QUDA_FLOAT2_CLOVER_ORDER : QUDA_FLOAT4_CLOVER_ORDER; } + + CloverFieldParam() : LatticeFieldParam(), + direct(true), inverse(true), clover(nullptr), norm(nullptr), + cloverInv(nullptr), invNorm(nullptr), twisted(false), mu2(0.0) { } + + CloverFieldParam(const CloverFieldParam ¶m) : LatticeFieldParam(param), + direct(param.direct), inverse(param.inverse), + clover(param.clover), norm(param.norm), + cloverInv(param.cloverInv), invNorm(param.invNorm), + twisted(param.twisted), mu2(param.mu2) { } }; std::ostream& operator<<(std::ostream& output, const CloverFieldParam& param); diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 73a3112398..ceb08ad5c2 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -144,7 +144,7 @@ namespace quda { complex tmp(a[parity][idx], a[parity][idx+1]); return tmp; } else { - // requesting upper triangular so return conjuate transpose + // requesting upper triangular so return conjugate transpose return conj(operator()(parity,x,s_col,s_row,c_col,c_row) ); } } @@ -252,65 +252,143 @@ namespace quda { template struct FloatNOrder { typedef typename mapper::type RegType; - Float *clover[2]; - float *norm[2]; + typedef typename VectorType::type Vector; + static const int M=length/(N*2); // number of short vectors per chiral block + static const int block=length/2; // chiral block size + + Float *clover; + float *norm; + size_t offset; + size_t norm_offset; const int volumeCB; const int stride; const bool twisted; const Float mu2; - - FloatNOrder(const CloverField &clover, bool inverse, Float *clover_=0, float *norm_=0) : volumeCB(clover.VolumeCB()), stride(clover.Stride()), - twisted(clover.Twisted()), mu2(clover.Mu2()) { - this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse)); - this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2); - this->norm[0] = norm_ ? norm_ : (float*)(clover.Norm(inverse)); - this->norm[1] = (float*)((char*)this->norm[0] + clover.NormBytes()/2); - } + + size_t bytes; + size_t norm_bytes; + void *backup_h; //! host memory for backing up the field when tuning + void *backup_norm_h; //! host memory for backing up norm when tuning + + FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, float *norm_=0) : + offset(clover.Bytes()/(2*sizeof(Float))), norm_offset(clover.NormBytes()/(2*sizeof(float))), + volumeCB(clover.VolumeCB()), stride(clover.Stride()), + twisted(clover.Twisted()), mu2(clover.Mu2()), bytes(clover.Bytes()), + norm_bytes(clover.NormBytes()), backup_h(nullptr), backup_norm_h(nullptr) + { + this->clover = clover_ ? clover_ : (Float*)(clover.V(is_inverse)); + this->norm = norm_ ? norm_ : (float*)(clover.Norm(is_inverse)); + } bool Twisted() const {return twisted;} Float Mu2() const {return mu2;} - __device__ __host__ inline void load(RegType v[length], int x, int parity) const { - const int M=length/(N*2); - for (int chirality=0; chirality<2; chirality++) { - for (int i=0; i(clover + parity*offset, x + stride*(chirality*M+i)); + // second do scalar copy converting into register type +#pragma unroll + for (int j=0; j(&vecTmp)[j]); } + + if (sizeof(Float)==sizeof(short)) +#pragma unroll + for (int i=0; i scale[chi] ? fabs(v[chi*M+i]) : scale[chi]; - norm[parity][chi*stride + x] = scale[chi]; - } +#pragma unroll + for (int i=0; i scale ? fabs(v[i]) : scale; + norm[parity*norm_offset + chirality*stride + x] = scale; } - const int M=length/(N*2); - for (int chirality=0; chirality<2; chirality++) { - for (int i=0; i(&vecTmp)[j], v[i*N+j] / scale); + else +#pragma unroll + for (int j=0; j(&vecTmp)[j], v[i*N+j]); + + // second do vectorized copy into memory + reinterpret_cast(clover + parity*offset)[x + stride*(chirality*M+i)] = vecTmp; + } + } + + /** + @brief Load accessor for the clover matrix + @param[out] v Vector of loaded elements + @param[in] x Checkerboarded site index + @param[in] parity Field parity + @param[in] chirality Chiral block index + */ + __device__ __host__ inline void load(RegType v[length], int x, int parity) const { +#pragma unroll + for (int chirality=0; chirality<2; chirality++) load(&v[chirality*36], x, parity, chirality); + } + + /** + @brief Store accessor for the clover matrix + @param[out] v Vector of elements to be stored + @param[in] x Checkerboarded site index + @param[in] parity Field parity + @param[in] chirality Chiral block index + */ + __device__ __host__ inline void save(const RegType v[length], int x, int parity) { +#pragma unroll + for (int chirality=0; chirality<2; chirality++) save(&v[chirality*36], x, parity, chirality); + } + + /** + @brief Backup the field to the host when tuning + */ + void save() { + if (backup_h) errorQuda("Already allocated host backup"); + backup_h = safe_malloc(bytes); + cudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost); + if (norm_bytes) { + backup_norm_h = safe_malloc(norm_bytes); + cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost); + } + checkCudaError(); + } + + /** + @brief Restore the field from the host after tuning + */ + void load() { + cudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice); + host_free(backup_h); + backup_h = nullptr; + if (norm_bytes) { + cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice); + host_free(backup_norm_h); + backup_norm_h = nullptr; } + checkCudaError(); } size_t Bytes() const { @@ -320,6 +398,7 @@ namespace quda { } }; + /** QDP ordering for clover fields */ diff --git a/include/dirac_quda.h b/include/dirac_quda.h index c62502b8ae..9488f4ca96 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -33,7 +33,6 @@ namespace quda { cudaGaugeField *fatGauge; // used by staggered only cudaGaugeField *longGauge; // used by staggered only cudaCloverField *clover; - cudaCloverField *cloverInv; double mu; // used by twisted mass only double epsilon; //2nd tm parameter (used by twisted mass only) @@ -49,7 +48,7 @@ namespace quda { DiracParam() : type(QUDA_INVALID_DIRAC), kappa(0.0), m5(0.0), matpcType(QUDA_MATPC_INVALID), - dagger(QUDA_DAG_INVALID), gauge(0), clover(0), cloverInv(0), mu(0.0), epsilon(0.0), + dagger(QUDA_DAG_INVALID), gauge(0), clover(0), mu(0.0), epsilon(0.0), tmp1(0), tmp2(0) { @@ -537,7 +536,6 @@ namespace quda { double mu; double epsilon; cudaCloverField &clover; - cudaCloverField &cloverInv; void checkParitySpinor(const ColorSpinorField &, const ColorSpinorField &) const; void twistedCloverApply(ColorSpinorField &out, const ColorSpinorField &in, const QudaTwistGamma5Type twistType, const int parity) const; diff --git a/include/multigrid.h b/include/multigrid.h index 682442e6e8..a8e1eb95d6 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -329,7 +329,6 @@ namespace quda { @param T[in] Transfer operator that defines the coarse space @param gauge[in] Gauge field from fine grid @param clover[in] Clover field on fine grid (optional) - @param cloverInv[in] Inverse Clover field on fine grid (optional, only for twisted-clover) @param kappa[in] Kappa parameter @param mu[in] Mu parameter (set to non-zero for twisted-mass/twisted-clover) @param matpc[in] The type of even-odd preconditioned fine-grid @@ -338,7 +337,7 @@ namespace quda { even-odd preconditioned and we coarsen the full operator. */ void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T, - const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv, + const cudaGaugeField &gauge, const cudaCloverField *clover, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc); /** diff --git a/include/quda.h b/include/quda.h index 7b22576d35..1b38429d14 100644 --- a/include/quda.h +++ b/include/quda.h @@ -183,6 +183,11 @@ extern "C" { int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */ double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */ + int compute_clover; /**< Whether to compute the clover field */ + int compute_clover_inverse; /**< Whether to compute the clover inverse field */ + int return_clover; /**< Whether to copy back the clover matrix field */ + int return_clover_inverse; /**< Whether to copy back the inverted clover matrix field */ + QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */ int sp_pad; /**< The padding to use for the fermion fields */ diff --git a/lib/check_params.h b/lib/check_params.h index a10b3d63cb..d04220a5a2 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -355,9 +355,18 @@ void printQudaInvertParam(QudaInvertParam *param) { #if defined INIT_PARAM P(clover_cuda_prec_precondition, QUDA_INVALID_PRECISION); P(compute_clover_trlog, 0); + P(compute_clover, 0); + P(compute_clover_inverse, 0); + P(return_clover, 0); + P(return_clover_inverse, 0); #else - if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION) - param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy; + if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION) + param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy; + P(compute_clover_trlog, QUDA_INVALID_PRECISION); + P(compute_clover, QUDA_INVALID_PRECISION); + P(compute_clover_inverse, QUDA_INVALID_PRECISION); + P(return_clover, QUDA_INVALID_PRECISION); + P(return_clover_inverse, QUDA_INVALID_PRECISION); #endif P(clover_order, QUDA_INVALID_CLOVER_ORDER); P(cl_pad, INVALID_INT); diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu index 2764a84e2f..0142c493af 100644 --- a/lib/clover_invert.cu +++ b/lib/clover_invert.cu @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace quda { @@ -13,35 +13,31 @@ namespace quda { #ifdef GPU_CLOVER_DIRAC template - struct CloverInvertArg { + struct CloverInvertArg : public ReduceArg { const Clover clover; Clover inverse; bool computeTraceLog; - double * const trlogA_h; - double *trlogA_d; //extra attributes for twisted mass clover bool twist; double mu2; - CloverInvertArg(Clover &inverse, const Clover &clover, bool computeTraceLog=0, double* const trlogA=0) : - inverse(inverse), clover(clover), computeTraceLog(computeTraceLog), trlogA_h(trlogA), twist(clover.Twisted()), mu2(clover.Mu2()){ - cudaHostGetDevicePointer(&trlogA_d, trlogA_h, 0); // set the matching device pointer - } + CloverInvertArg(Clover &inverse, const Clover &clover, bool computeTraceLog=0) : + ReduceArg(), inverse(inverse), clover(clover), computeTraceLog(computeTraceLog), + twist(clover.Twisted()), mu2(clover.Mu2()) { } }; /** Use a Cholesky decomposition to invert the clover matrix Here we use an inplace inversion which hopefully reduces register pressure */ - template - __device__ __host__ double cloverInvertCompute(CloverInvertArg arg, int x, int parity) { - - Float A[72]; - double trlogA = 0.0; + template + __device__ __host__ inline double cloverInvertCompute(CloverInvertArg arg, int x, int parity) { - // load the clover term into memory - arg.clover.load(A, x, parity); + double trlogA = 0.0; for (int ch=0; ch<2; ch++) { + Float A[36]; + // load the clover term into memory + arg.clover.load(A, x, parity, ch); Float diag[6]; Float tmp[6]; // temporary storage @@ -50,51 +46,36 @@ namespace quda { // hack into the right order as MILC just to copy algorithm directly // FIXME use native ordering in the Cholseky // factor of two is inherent to QUDA clover storage - for (int i=0; i<6; i++) diag[i] = 2.0*A[ch*36+i]; + constexpr Float two = static_cast(2.0); + for (int i=0; i<6; i++) diag[i] = two*A[ch*36+i]; const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14}; - for (int i=0; i<15; i++) tri[idtab[i]] = complex(2.0*A[ch*36+6+2*i], 2.0*A[ch*36+6+2*i+1]); +#pragma unroll + for (int i=0; i<15; i++) tri[idtab[i]] = complex(two*A[ch*36+6+2*i], two*A[ch*36+6+2*i+1]); -//Compute (T^2 + mu2) first, then invert (not optimized!): - if(arg.twist) - { + //Compute (T^2 + mu2) first, then invert (not optimized!): + if (twist) { complex aux[15];//hmmm, better to reuse A-regs... //another solution just to define (but compiler may not be happy with this, swapping everything in //the global buffer): //complex* aux = (complex*)&A[ch*36]; //compute off-diagonal terms: -// aux[ 0] = tri[0]*diag[0]+diag[1]*tri[0]+conj(tri[2])*tri[1]+conj(tri[4])*tri[3]+conj(tri[7])*tri[6]+conj(tri[11])*tri[10]; -// aux[ 1] = tri[1]*diag[0]+diag[2]*tri[1]+tri[2]*tri[0]+conj(tri[5])*tri[3]+conj(tri[8])*tri[6]+conj(tri[12])*tri[10]; - aux[ 2] = tri[2]*diag[1]+diag[2]*tri[2]+tri[1]*conj(tri[0])+conj(tri[5])*tri[4]+conj(tri[8])*tri[7]+conj(tri[12])*tri[11]; -// aux[ 3] = tri[3]*diag[0]+diag[3]*tri[3]+tri[4]*tri[0]+tri[5]*tri[1]+conj(tri[9])*tri[6]+conj(tri[13])*tri[10]; - aux[ 4] = tri[4]*diag[1]+diag[3]*tri[4]+tri[3]*conj(tri[0])+tri[5]*tri[2]+conj(tri[9])*tri[7]+conj(tri[13])*tri[11]; - aux[ 5] = tri[5]*diag[2]+diag[3]*tri[5]+tri[3]*conj(tri[1])+tri[4]*conj(tri[2])+conj(tri[9])*tri[8]+conj(tri[13])*tri[12]; -// aux[ 6] = tri[6]*diag[0]+diag[4]*tri[6]+tri[7]*tri[0]+tri[8]*tri[1]+tri[9]*tri[3]+conj(tri[14])*tri[10]; - aux[ 7] = tri[7]*diag[1]+diag[4]*tri[7]+tri[6]*conj(tri[0])+tri[8]*tri[2]+tri[9]*tri[4]+conj(tri[14])*tri[11]; - aux[ 8] = tri[8]*diag[2]+diag[4]*tri[8]+tri[6]*conj(tri[1])+tri[7]*conj(tri[2])+tri[9]*tri[5]+conj(tri[14])*tri[12]; - aux[ 9] = tri[9]*diag[3]+diag[4]*tri[9]+tri[6]*conj(tri[3])+tri[7]*conj(tri[4])+tri[8]*conj(tri[5])+conj(tri[14])*tri[13]; -// aux[10] = tri[10]*diag[0]+diag[5]*tri[10]+tri[11]*tri[0]+tri[12]*tri[1]+tri[13]*tri[3]+tri[14]*tri[6]; - aux[11] = tri[11]*diag[1]+diag[5]*tri[11]+tri[10]*conj(tri[0])+tri[12]*tri[2]+tri[13]*tri[4]+tri[14]*tri[7]; - aux[12] = tri[12]*diag[2]+diag[5]*tri[12]+tri[10]*conj(tri[1])+tri[11]*conj(tri[2])+tri[13]*tri[5]+tri[14]*tri[8]; - aux[13] = tri[13]*diag[3]+diag[5]*tri[13]+tri[10]*conj(tri[3])+tri[11]*conj(tri[4])+tri[12]*conj(tri[5])+tri[14]*tri[9]; - aux[14] = tri[14]*diag[4]+diag[5]*tri[14]+tri[10]*conj(tri[6])+tri[11]*conj(tri[7])+tri[12]*conj(tri[8])+tri[13]*conj(tri[9]); - //update diagonal elements: diag[0] = (Float)arg.mu2+diag[0]*diag[0]+norm(tri[ 0])+norm(tri[ 1])+norm(tri[ 3])+norm(tri[ 6])+norm(tri[10]); diag[1] = (Float)arg.mu2+diag[1]*diag[1]+norm(tri[ 0])+norm(tri[ 2])+norm(tri[ 4])+norm(tri[ 7])+norm(tri[11]); @@ -103,10 +84,10 @@ namespace quda { diag[4] = (Float)arg.mu2+diag[4]*diag[4]+norm(tri[ 6])+norm(tri[ 7])+norm(tri[ 8])+norm(tri[ 9])+norm(tri[14]); diag[5] = (Float)arg.mu2+diag[5]*diag[5]+norm(tri[10])+norm(tri[11])+norm(tri[12])+norm(tri[13])+norm(tri[14]); - //update off-diagonal elements: + //update off-diagonal elements: for(int i = 0; i < 15; i++) tri[i] = aux[i]; } -// + for (int j=0; j<6; j++) { diag[j] = sqrt(diag[j]); tmp[j] = 1.0 / diag[j]; @@ -128,7 +109,7 @@ namespace quda { } /* Accumulate trlogA */ - for (int j=0;j<6;j++) trlogA += (double)2.0*log((double)(diag[j])); + if (computeTrLog) for (int j=0;j<6;j++) trlogA += 2.0*log((double)(diag[j])); /* Now use forward and backward substitution to construct inverse */ complex v1[6]; @@ -165,79 +146,94 @@ namespace quda { } } - for (int i=0; i<6; i++) A[ch*36+i] = 0.5 * diag[i]; - for (int i=0; i<15; i++) { - A[ch*36+6+2*i] = 0.5*tri[idtab[i]].real(); A[ch*36+6+2*i+1] = 0.5*tri[idtab[i]].imag(); - } - } + constexpr Float half = static_cast(0.5); + for (int i=0; i<6; i++) A[ch*36+i] = half * diag[i]; +#pragma unroll + for (int i=0; i<15; i++) { A[ch*36+6+2*i] = half*tri[idtab[i]].real(); A[ch*36+6+2*i+1] = half*tri[idtab[i]].imag(); } - // save the inverted matrix - arg.inverse.save(A, x, parity); + // save the inverted matrix + arg.inverse.save(A, x, parity, ch); + } return trlogA; } - template + template void cloverInvert(CloverInvertArg arg) { for (int parity=0; parity<2; parity++) { for (int x=0; x(arg, x, parity); - if (arg.computeTraceLog) arg.trlogA_h[parity] += trlogA; + double trlogA = cloverInvertCompute(arg, x, parity); + if (computeTrLog) { + if (parity) arg.result_h[0].y += trlogA; + else arg.result_h[0].x += trlogA; + } } } } - template + template + __launch_bounds__(2*blockSize) __global__ void cloverInvertKernel(CloverInvertArg arg) { int idx = blockIdx.x*blockDim.x + threadIdx.x; - //if (idx >= arg.clover.volumeCB) return; - int parity = blockIdx.y; - double trlogA = 0.0; - if (idx < arg.clover.volumeCB) trlogA = cloverInvertCompute(arg, idx, parity); - - if (arg.computeTraceLog) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - double aggregate = BlockReduce(temp_storage).Sum(trlogA); - if (threadIdx.x == 0) atomicAdd(arg.trlogA_d+parity, aggregate); - } + int parity = threadIdx.y; + double trlogA_parity = 0.0; + if (idx < arg.clover.volumeCB) + trlogA_parity = cloverInvertCompute(arg, idx, parity); + double2 trlogA = parity ? make_double2(0.0,trlogA_parity) : make_double2(trlogA_parity, 0.0); + if (computeTrLog) reduce2d(arg, trlogA); } template - class CloverInvert : Tunable { + class CloverInvert : TunableLocalParity { CloverInvertArg arg; const CloverField &meta; // used for meta data only const QudaFieldLocation location; private: - unsigned int sharedBytesPerThread() const { return 0; } - unsigned int sharedBytesPerBlock(const TuneParam ¶m) const { return 0 ;} - bool tuneSharedBytes() const { return false; } // Don't tune the shared memory - bool tuneGridDim() const { return false; } // Don't tune the grid dimensions. unsigned int minThreads() const { return arg.clover.volumeCB; } public: CloverInvert(CloverInvertArg &arg, const CloverField &meta, QudaFieldLocation location) : arg(arg), meta(meta), location(location) { - writeAuxString("stride=%d,prec=%lu",arg.clover.stride,sizeof(Float)); + writeAuxString("stride=%d,prec=%lu,trlog=%s,twist=%s", arg.clover.stride, sizeof(Float), + arg.computeTraceLog ? "true" : "false", arg.twist ? "true" : "false"); } virtual ~CloverInvert() { ; } void apply(const cudaStream_t &stream) { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - arg.trlogA_h[0] = 0.0; arg.trlogA_h[1] = 0.0; + arg.result_h[0] = make_double2(0.,0.); if (location == QUDA_CUDA_FIELD_LOCATION) { - tp.grid.y = 2; // for parity - LAUNCH_KERNEL(cloverInvertKernel, tp, stream, arg, Float, Clover); + if (arg.computeTraceLog) { + if (arg.twist) { + LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, true, true); + } else { + LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, true, false); + } + } else { + if (arg.twist) { + LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, false, true); + } else { + LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, false, false); + } + } } else { - cloverInvert<1, Float, Clover>(arg); - } - if (arg.computeTraceLog) { - cudaDeviceSynchronize(); - reduceDoubleArray(arg.trlogA_h, 2); + if (arg.computeTraceLog) { + if (arg.twist) { + cloverInvert<1, Float, Clover, true, true>(arg); + } else { + cloverInvert<1, Float, Clover, true, false>(arg); + } + } else { + if (arg.twist) { + cloverInvert<1, Float, Clover, false, true>(arg); + } else { + cloverInvert<1, Float, Clover, false, false>(arg); + } + } } } @@ -254,15 +250,25 @@ namespace quda { long long flops() const { return 0; } long long bytes() const { return 2*arg.clover.volumeCB*(arg.inverse.Bytes() + arg.clover.Bytes()); } + + void preTune() { if (arg.clover.clover == arg.inverse.clover) arg.inverse.save(); } + void postTune() { if (arg.clover.clover == arg.inverse.clover) arg.inverse.load(); } + }; template void cloverInvert(Clover inverse, const Clover clover, bool computeTraceLog, double* const trlog, const CloverField &meta, QudaFieldLocation location) { - CloverInvertArg arg(inverse, clover, computeTraceLog, trlog); + CloverInvertArg arg(inverse, clover, computeTraceLog); CloverInvert invert(arg, meta, location); invert.apply(0); - cudaDeviceSynchronize(); + + if (arg.computeTraceLog) { + cudaDeviceSynchronize(); + comm_allreduce_array((double*)arg.result_h, 2); + trlog[0] = arg.result_h[0].x; + trlog[1] = arg.result_h[0].y; + } } template diff --git a/lib/clover_quda.cu b/lib/clover_quda.cu index 17fb161dec..4814d057f1 100644 --- a/lib/clover_quda.cu +++ b/lib/clover_quda.cu @@ -95,7 +95,6 @@ namespace quda { const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14}; Float diag[6]; Complex triangle[15]; - Float A[72]; // This uses lots of unnecessary memory for(int ch=0; ch<2; ++ch){ @@ -129,18 +128,17 @@ namespace quda { triangle[14] = block1[ch](2,1); - for(int i=0; i<6; ++i){ - A[ch*36 + i] = 0.5*diag[i]; - } + Float A[36]; + for(int i=0; i<6; ++i) A[i] = static_cast(0.5)*diag[i]; + for(int i=0; i<15; ++i){ - A[ch*36+6+2*i] = 0.5*triangle[idtab[i]].x; - A[ch*36+6+2*i + 1] = 0.5*triangle[idtab[i]].y; + A[6+2*i] = 0.5*triangle[idtab[i]].x; + A[6+2*i + 1] = 0.5*triangle[idtab[i]].y; } + arg.clover.save(A, idx, parity, ch); } // ch // 84 floating-point ops - - arg.clover.save(A, idx, parity); return; } diff --git a/lib/coarse_op.cu b/lib/coarse_op.cu index 1dc9c3f0ed..5a0e28b24d 100644 --- a/lib/coarse_op.cu +++ b/lib/coarse_op.cu @@ -15,7 +15,7 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { typedef typename colorspinor::FieldOrderCB F; typedef typename gauge::FieldOrder gFine; @@ -34,7 +34,7 @@ namespace quda { gCoarse xAccessor(const_cast(X)); gCoarse xInvAccessor(const_cast(Xinv)); cFine cAccessor(const_cast(c), false); - cFine cInvAccessor(const_cast(cI), true); + cFine cInvAccessor(const_cast(c), true); calculateY (yAccessor, xAccessor, xInvAccessor, uvAccessor, avAccessor, vAccessor, gAccessor, cAccessor, cInvAccessor, Y, X, Xinv, Yhat, av, v, kappa, mu, dirac, matpc); @@ -44,28 +44,28 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (T.Vectors().Nspin()/T.Spin_bs() != 2) errorQuda("Unsupported number of coarse spins %d\n",T.Vectors().Nspin()/T.Spin_bs()); const int coarseSpin = 2; const int coarseColor = Y.Ncolor() / coarseSpin; if (coarseColor == 2) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 4) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 8) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 12) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 16) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 20) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 24) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 32) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported number of coarse dof %d\n", Y.Ncolor()); } @@ -74,9 +74,9 @@ namespace quda { // template on fine spin template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (uv.Nspin() == 4) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported number of spins %d\n", uv.Nspin()); } @@ -85,9 +85,9 @@ namespace quda { // template on fine colors template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (g.Ncolor() == 3) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported number of colors %d\n", g.Ncolor()); } @@ -95,10 +95,10 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { //If c == NULL, then this is standard Wilson. csOrder is dummy and will not matter if (c.Order() == QUDA_PACKED_CLOVER_ORDER) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported field order %d\n", c.Order()); } @@ -106,9 +106,9 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (g.FieldOrder() == QUDA_QDP_GAUGE_ORDER) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported field order %d\n", g.FieldOrder()); } @@ -116,9 +116,9 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (T.Vectors().FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported field order %d\n", T.Vectors().FieldOrder()); } @@ -126,7 +126,7 @@ namespace quda { //Does the heavy lifting of creating the coarse color matrices Y void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (X.Precision() != Y.Precision() || Y.Precision() != uv.Precision() || Y.Precision() != T.Vectors().Precision() || Y.Precision() != g.Precision()) errorQuda("Unsupported precision mix"); @@ -135,12 +135,12 @@ namespace quda { if (Y.Precision() == QUDA_DOUBLE_PRECISION) { #ifdef GPU_MULTIGRID_DOUBLE - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); #else errorQuda("Double precision multigrid has not been enabled"); #endif } else if (Y.Precision() == QUDA_SINGLE_PRECISION) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported precision %d\n", Y.Precision()); } @@ -150,7 +150,7 @@ namespace quda { //Calculates the coarse color matrix and puts the result in Y. //N.B. Assumes Y, X have been allocated. void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T, - const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv, + const cudaGaugeField &gauge, const cudaCloverField *clover, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { QudaPrecision precision = Y.Precision(); //First make a cpu gauge field from the cuda gauge field @@ -203,19 +203,9 @@ namespace quda { cf_param.create = QUDA_NULL_FIELD_CREATE; cf_param.siteSubset = QUDA_FULL_SITE_SUBSET; - if (cloverInv && (dirac == QUDA_TWISTED_CLOVERPC_DIRAC)) { - cf_param.direct = false; - cpuCloverField cI(cf_param); - cloverInv->saveCPUField(cI); - cf_param.direct = true; - cpuCloverField c(cf_param); - clover->saveCPUField(c); - calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, cI, kappa, mu, dirac, matpc); - } else { - cpuCloverField c(cf_param); - if (clover) clover->saveCPUField(c); - calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, c, kappa, mu, dirac, matpc); - } + cpuCloverField c(cf_param); + if (clover) clover->saveCPUField(c); + calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, kappa, mu, dirac, matpc); if (&T.Vectors() != av) delete av; delete uv; diff --git a/lib/dirac_clover.cpp b/lib/dirac_clover.cpp index 76f4a45d04..6d12fa2bc8 100644 --- a/lib/dirac_clover.cpp +++ b/lib/dirac_clover.cpp @@ -158,8 +158,7 @@ namespace quda { } void DiracClover::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { - cudaCloverField *cInv = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, cInv, kappa, 0.0, QUDA_CLOVER_DIRAC, QUDA_MATPC_INVALID); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, 0.0, QUDA_CLOVER_DIRAC, QUDA_MATPC_INVALID); } DiracCloverPC::DiracCloverPC(const DiracParam ¶m) : @@ -379,8 +378,7 @@ namespace quda { } void DiracCloverPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { - cudaCloverField *cInv = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, cInv, kappa, 0.0, QUDA_CLOVERPC_DIRAC, matpcType); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, 0.0, QUDA_CLOVERPC_DIRAC, matpcType); } } // namespace quda diff --git a/lib/dirac_twisted_clover.cpp b/lib/dirac_twisted_clover.cpp index 6fbf3f9e54..4f4bfc9477 100644 --- a/lib/dirac_twisted_clover.cpp +++ b/lib/dirac_twisted_clover.cpp @@ -14,14 +14,14 @@ namespace quda { } DiracTwistedClover::DiracTwistedClover(const DiracParam ¶m, const int nDim) - : DiracWilson(param, nDim), mu(param.mu), epsilon(param.epsilon), clover(*(param.clover)), cloverInv(*(param.cloverInv)) + : DiracWilson(param, nDim), mu(param.mu), epsilon(param.epsilon), clover(*(param.clover)) { twistedclover::initConstants(*param.gauge,profile); dslash_aux::initConstants(*param.gauge,profile); } DiracTwistedClover::DiracTwistedClover(const DiracTwistedClover &dirac) - : DiracWilson(dirac), mu(dirac.mu), epsilon(dirac.epsilon), clover(dirac.clover), cloverInv(dirac.cloverInv) + : DiracWilson(dirac), mu(dirac.mu), epsilon(dirac.epsilon), clover(dirac.clover) { twistedclover::initConstants(*dirac.gauge,profile); dslash_aux::initConstants(*dirac.gauge,profile); @@ -35,7 +35,6 @@ namespace quda { { DiracWilson::operator=(dirac); clover = dirac.clover; - cloverInv = dirac.cloverInv; } return *this; @@ -61,12 +60,9 @@ namespace quda { if (in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS) { - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); + double flavor_mu = in.TwistFlavor() * mu; twistCloverGamma5Cuda(&static_cast(out), &static_cast(in), @@ -78,9 +74,7 @@ namespace quda { flops += 552ll*in.Volume(); delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif } else errorQuda("DiracTwistedClover::twistedCloverApply method for flavor doublet is not implemented..\n"); @@ -113,12 +107,8 @@ namespace quda { twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){ double a = 2.0 * kappa * in.TwistFlavor() * mu;//for direct twist (must be daggered separately) @@ -136,9 +126,7 @@ namespace quda { } deleteTmp(&tmp, reset); delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif } void DiracTwistedClover::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const @@ -172,7 +160,7 @@ namespace quda { void DiracTwistedClover::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { double a = 2.0 * kappa * mu * T.Vectors().TwistFlavor(); - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, &cloverInv, kappa, a, QUDA_TWISTED_CLOVER_DIRAC, QUDA_MATPC_INVALID); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, a, QUDA_TWISTED_CLOVER_DIRAC, QUDA_MATPC_INVALID); } DiracTwistedCloverPC::DiracTwistedCloverPC(const DiracTwistedCloverPC &dirac) : DiracTwistedClover(dirac) { } @@ -213,12 +201,8 @@ namespace quda { twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); if (in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){ double a = -2.0 * kappa * in.TwistFlavor() * mu; //for invert twist (not daggered) @@ -239,9 +223,7 @@ namespace quda { errorQuda("Non-degenerate DiracTwistedCloverPC is not implemented \n"); } delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif } // xpay version of the above @@ -257,12 +239,8 @@ namespace quda { twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){ double a = -2.0 * kappa * in.TwistFlavor() * mu; //for invert twist @@ -287,9 +265,7 @@ namespace quda { errorQuda("Non-degenerate DiracTwistedCloverPC is not implemented \n"); } delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif } void DiracTwistedCloverPC::M(ColorSpinorField &out, const ColorSpinorField &in) const @@ -298,12 +274,8 @@ namespace quda { bool reset = newTmp(&tmp1, in); - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){ if (matpcType == QUDA_MATPC_EVEN_EVEN) { @@ -333,7 +305,7 @@ namespace quda { twistedCloverDslashCuda(&static_cast(out), *gauge, cs, cI, static_cast(tmp1), QUDA_EVEN_PARITY, dagger, &static_cast(in), - QUDA_DEG_DSLASH_CLOVER_TWIST_XPAY, a, kappa, 0.0, 0.0, commDim, profile); + QUDA_DEG_DSLASH_CLOVER_TWIST_XPAY, a, kappa2, 0.0, 0.0, commDim, profile); flops += (1320ll+96ll)*in.Volume(); } else if (matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { @@ -341,7 +313,7 @@ namespace quda { twistedCloverDslashCuda(&static_cast(out), *gauge, cs, cI, static_cast(tmp1), QUDA_ODD_PARITY, dagger, &static_cast(in), - QUDA_DEG_DSLASH_CLOVER_TWIST_XPAY, a, kappa, 0.0, 0.0, commDim, profile); + QUDA_DEG_DSLASH_CLOVER_TWIST_XPAY, a, kappa2, 0.0, 0.0, commDim, profile); flops += (1320ll+96ll)*in.Volume(); }else { // symmetric preconditioning errorQuda("Invalid matpcType"); @@ -352,9 +324,7 @@ namespace quda { } delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif deleteTmp(&tmp1, reset); } @@ -386,27 +356,27 @@ namespace quda { if (matpcType == QUDA_MATPC_EVEN_EVEN) { // src = A_ee^-1 (b_e + k D_eo A_oo^-1 b_o) src = &(x.Odd()); - TwistCloverInv(*src, b.Odd(), 1); + TwistCloverInv(*src, b.Odd(), QUDA_ODD_PARITY); DiracWilson::DslashXpay(*tmp1, *src, QUDA_EVEN_PARITY, b.Even(), kappa); - TwistCloverInv(*src, *tmp1, 0); + TwistCloverInv(*src, *tmp1, QUDA_EVEN_PARITY); sol = &(x.Even()); } else if (matpcType == QUDA_MATPC_ODD_ODD) { // src = A_oo^-1 (b_o + k D_oe A_ee^-1 b_e) src = &(x.Even()); - TwistCloverInv(*src, b.Even(), 0); + TwistCloverInv(*src, b.Even(), QUDA_EVEN_PARITY); DiracWilson::DslashXpay(*tmp1, *src, QUDA_ODD_PARITY, b.Odd(), kappa); - TwistCloverInv(*src, *tmp1, 1); + TwistCloverInv(*src, *tmp1, QUDA_ODD_PARITY); sol = &(x.Odd()); } else if (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { // src = b_e + k D_eo A_oo^-1 b_o src = &(x.Odd()); - TwistCloverInv(*tmp1, b.Odd(), 1); // safe even when *tmp1 = b.odd + TwistCloverInv(*tmp1, b.Odd(), QUDA_ODD_PARITY); // safe even when *tmp1 = b.odd DiracWilson::DslashXpay(*src, *tmp1, QUDA_EVEN_PARITY, b.Even(), kappa); sol = &(x.Even()); } else if (matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { // src = b_o + k D_oe A_ee^-1 b_e src = &(x.Even()); - TwistCloverInv(*tmp1, b.Even(), 0); // safe even when *tmp1 = b.even + TwistCloverInv(*tmp1, b.Even(), QUDA_EVEN_PARITY); // safe even when *tmp1 = b.even DiracWilson::DslashXpay(*src, *tmp1, QUDA_ODD_PARITY, b.Odd(), kappa); sol = &(x.Odd()); } else { @@ -436,11 +406,11 @@ namespace quda { if (matpcType == QUDA_MATPC_EVEN_EVEN || matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { // x_o = A_oo^-1 (b_o + k D_oe x_e) DiracWilson::DslashXpay(*tmp1, x.Even(), QUDA_ODD_PARITY, b.Odd(), kappa); - TwistCloverInv(x.Odd(), *tmp1, 1); + TwistCloverInv(x.Odd(), *tmp1, QUDA_ODD_PARITY); } else if (matpcType == QUDA_MATPC_ODD_ODD || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { // x_e = A_ee^-1 (b_e + k D_eo x_o) DiracWilson::DslashXpay(*tmp1, x.Odd(), QUDA_EVEN_PARITY, b.Even(), kappa); - TwistCloverInv(x.Even(), *tmp1, 0); + TwistCloverInv(x.Even(), *tmp1, QUDA_EVEN_PARITY); } else { errorQuda("MatPCType %d not valid for DiracTwistedCloverPC", matpcType); } @@ -452,6 +422,6 @@ namespace quda { void DiracTwistedCloverPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { double a = -2.0 * kappa * mu * T.Vectors().TwistFlavor(); - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, &cloverInv, kappa, a, QUDA_TWISTED_CLOVERPC_DIRAC, matpcType); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, a, QUDA_TWISTED_CLOVERPC_DIRAC, matpcType); } } // namespace quda diff --git a/lib/dirac_twisted_mass.cpp b/lib/dirac_twisted_mass.cpp index 25088fbeb8..36c404067b 100644 --- a/lib/dirac_twisted_mass.cpp +++ b/lib/dirac_twisted_mass.cpp @@ -189,7 +189,7 @@ namespace quda { void DiracTwistedMass::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { double a = 2.0 * kappa * mu * T.Vectors().TwistFlavor(); cudaCloverField *c = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, a, QUDA_TWISTED_MASS_DIRAC, QUDA_MATPC_INVALID); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, a, QUDA_TWISTED_MASS_DIRAC, QUDA_MATPC_INVALID); } DiracTwistedMassPC::DiracTwistedMassPC(const DiracTwistedMassPC &dirac) : DiracTwistedMass(dirac) { } @@ -548,6 +548,6 @@ namespace quda { void DiracTwistedMassPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { double a = -2.0 * kappa * mu * T.Vectors().TwistFlavor(); cudaCloverField *c = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, a, QUDA_TWISTED_MASSPC_DIRAC, matpcType); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, a, QUDA_TWISTED_MASSPC_DIRAC, matpcType); } } // namespace quda diff --git a/lib/dirac_wilson.cpp b/lib/dirac_wilson.cpp index 5b66da3077..cc27de9134 100644 --- a/lib/dirac_wilson.cpp +++ b/lib/dirac_wilson.cpp @@ -158,7 +158,7 @@ namespace quda { void DiracWilson::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { cudaCloverField *c = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, 0.0, QUDA_WILSON_DIRAC, QUDA_MATPC_INVALID); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, 0.0, QUDA_WILSON_DIRAC, QUDA_MATPC_INVALID); } DiracWilsonPC::DiracWilsonPC(const DiracParam ¶m) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index a7e50db23c..64dab6ebd6 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -130,10 +130,6 @@ cudaCloverField *cloverPrecise = NULL; cudaCloverField *cloverSloppy = NULL; cudaCloverField *cloverPrecondition = NULL; -cudaCloverField *cloverInvPrecise = NULL; -cudaCloverField *cloverInvSloppy = NULL; -cudaCloverField *cloverInvPrecondition = NULL; - cudaGaugeField *momResident = NULL; cudaGaugeField *extendedGaugeResident = NULL; @@ -699,6 +695,8 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) { + if (!gaugePrecise) errorQuda("Cannot call loadCloverQuda with no resident gauge field"); + profileClover.TPSTART(QUDA_PROFILE_TOTAL); bool device_calc = false; // calculate clover and inverse on the device? @@ -707,23 +705,16 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (!initialized) errorQuda("QUDA not initialized"); - if (!h_clover && !h_clovinv) { - if(inv_param->clover_coeff != 0){ - device_calc = true; - }else{ - errorQuda("loadCloverQuda() called with neither clover term nor inverse"); - } + if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) { + device_calc = true; + if (inv_param->clover_coeff == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient not set"); + if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field"); } - - if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) { - errorQuda("Half precision not supported on CPU"); - } - if (gaugePrecise == NULL) { - errorQuda("Gauge field must be loaded before clover"); - } + if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) errorQuda("Half precision not supported on CPU"); + if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded before clover"); if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)) { - errorQuda("Wrong dslash_type in loadCloverQuda()"); + errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type); } // determines whether operator is preconditioned when calling invertQuda() @@ -751,243 +742,151 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) warningQuda("Uninverted clover term not loaded"); } - CloverFieldParam clover_param; - CloverField *in=NULL; -#ifndef DYNAMIC_CLOVER - CloverField *inInv=NULL; + bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false; +#ifdef DYNAMIC_CLOVER + bool dynamic_clover = twisted ? true : false; // dynamic clover only supported on twisted clover currently +#else + bool dynamic_clover = false; #endif - if(!device_calc){ - // create a param for the cpu clover field - profileClover.TPSTART(QUDA_PROFILE_INIT); - CloverFieldParam cpuParam; - cpuParam.nDim = 4; - for (int i=0; i<4; i++) cpuParam.x[i] = gaugePrecise->X()[i]; - cpuParam.precision = inv_param->clover_cpu_prec; - cpuParam.order = inv_param->clover_order; - cpuParam.direct = h_clover ? true : false; - cpuParam.inverse = h_clovinv ? true : false; - cpuParam.clover = h_clover; - cpuParam.norm = 0; - cpuParam.cloverInv = h_clovinv; - cpuParam.invNorm = 0; - cpuParam.create = QUDA_REFERENCE_FIELD_CREATE; - cpuParam.siteSubset = QUDA_FULL_SITE_SUBSET; - cpuParam.twisted = false; - cpuParam.mu2 = 0.; - - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - cpuParam.direct = true; - cpuParam.inverse = false; - cpuParam.cloverInv = NULL; - cpuParam.clover = h_clover; - cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? - static_cast(new cpuCloverField(cpuParam)) : - static_cast(new cudaCloverField(cpuParam)); - -#ifndef DYNAMIC_CLOVER - cpuParam.cloverInv = h_clovinv; - cpuParam.clover = NULL; - cpuParam.twisted = true; - cpuParam.direct = true; - cpuParam.inverse = false; - cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - - inInv = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? - static_cast(new cpuCloverField(cpuParam)) : - static_cast(new cudaCloverField(cpuParam)); -#endif - } else { - in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? - static_cast(new cpuCloverField(cpuParam)) : - static_cast(new cudaCloverField(cpuParam)); - } + CloverFieldParam clover_param; + clover_param.nDim = 4; + clover_param.twisted = twisted; + clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0; + clover_param.siteSubset = QUDA_FULL_SITE_SUBSET; + for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i]; + clover_param.pad = inv_param->cl_pad; + clover_param.create = QUDA_NULL_FIELD_CREATE; + clover_param.norm = nullptr; + clover_param.invNorm = nullptr; + clover_param.setPrecision(inv_param->clover_cuda_prec); + clover_param.direct = h_clover || device_calc ? true : false; + clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover ? true : false; - clover_param.nDim = 4; - for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i]; - clover_param.setPrecision(inv_param->clover_cuda_prec); - clover_param.pad = inv_param->cl_pad; - clover_param.direct = h_clover ? true : false; - clover_param.inverse = (h_clovinv || pc_solve) ? true : false; - clover_param.create = QUDA_NULL_FIELD_CREATE; - clover_param.siteSubset = QUDA_FULL_SITE_SUBSET; - - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - clover_param.direct = true; - clover_param.inverse = false; - clover_param.twisted = true; - cloverPrecise = new cudaCloverField(clover_param); -#ifndef DYNAMIC_CLOVER - clover_param.direct = false; - clover_param.inverse = true; - clover_param.twisted = true; - cloverInvPrecise = new cudaCloverField(clover_param); -// clover_param.twisted = false; -#endif - } else { - cloverPrecise = new cudaCloverField(clover_param); - } + cloverPrecise = new cudaCloverField(clover_param); + + CloverField *in = nullptr; + if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) { + // create a param for the cpu clover field + profileClover.TPSTART(QUDA_PROFILE_INIT); + CloverFieldParam inParam(clover_param); + inParam.precision = inv_param->clover_cpu_prec; + inParam.order = inv_param->clover_order; + inParam.direct = h_clover ? true : false; + inParam.inverse = h_clovinv ? true : false; + inParam.clover = h_clover; + inParam.cloverInv = h_clovinv; + inParam.create = QUDA_REFERENCE_FIELD_CREATE; + in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? + static_cast(new cpuCloverField(inParam)) : + static_cast(new cudaCloverField(inParam)); profileClover.TPSTOP(QUDA_PROFILE_INIT); + } + if (!device_calc) { profileClover.TPSTART(QUDA_PROFILE_H2D); - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - cloverPrecise->copy(*in, false); -#ifndef DYNAMIC_CLOVER - cloverInvPrecise->copy(*in, true); - cloverInvert(*cloverInvPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); -#endif - } else { - cloverPrecise->copy(*in, h_clovinv ? true : false); - } - + cloverPrecise->copy(*in, h_clovinv && !inv_param->compute_clover_inverse ? true : false); profileClover.TPSTOP(QUDA_PROFILE_H2D); } else { profileClover.TPSTART(QUDA_PROFILE_COMPUTE); - createCloverQuda(inv_param); - -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - cloverInvert(*cloverInvPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); - if (inv_param->compute_clover_trlog) { - inv_param->trlogA[0] = cloverInvPrecise->TrLog()[0]; - inv_param->trlogA[1] = cloverInvPrecise->TrLog()[1]; - } - } -#endif profileClover.TPSTOP(QUDA_PROFILE_COMPUTE); } // inverted clover term is required when applying preconditioned operator - if ((!h_clovinv && pc_solve) && inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) { + if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) { profileClover.TPSTART(QUDA_PROFILE_COMPUTE); - cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); - profileClover.TPSTOP(QUDA_PROFILE_COMPUTE); - if (inv_param->compute_clover_trlog) { - inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; - inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; + if (!dynamic_clover) { + cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); + if (inv_param->compute_clover_trlog) { + inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; + inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; + } } + profileClover.TPSTOP(QUDA_PROFILE_COMPUTE); } -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) - inv_param->cloverGiB = cloverPrecise->GBytes(); - else - inv_param->cloverGiB = cloverPrecise->GBytes() + cloverInvPrecise->GBytes(); -#else inv_param->cloverGiB = cloverPrecise->GBytes(); -#endif - clover_param.norm = 0; - clover_param.invNorm = 0; - clover_param.mu2 = 0.; - clover_param.nDim = 4; - for(int dir=0; dir<4; ++dir) clover_param.x[dir] = gaugePrecise->X()[dir]; - clover_param.pad = inv_param->cl_pad; - clover_param.siteSubset = QUDA_FULL_SITE_SUBSET; - clover_param.create = QUDA_NULL_FIELD_CREATE; clover_param.direct = true; - clover_param.inverse = true; + clover_param.inverse = dynamic_clover ? false : true; // create the mirror sloppy clover field if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) { profileClover.TPSTART(QUDA_PROFILE_INIT); + clover_param.setPrecision(inv_param->clover_cuda_prec_sloppy); + cloverSloppy = new cudaCloverField(clover_param); + cloverSloppy->copy(*cloverPrecise, clover_param.inverse); - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - clover_param.twisted = true; -#ifndef DYNAMIC_CLOVER - clover_param.direct = false; - clover_param.inverse = true; - cloverInvSloppy = new cudaCloverField(clover_param); - cloverInvSloppy->copy(*cloverInvPrecise, true); - clover_param.direct = true; - clover_param.inverse = false; - inv_param->cloverGiB += cloverInvSloppy->GBytes(); -#endif - cloverSloppy = new cudaCloverField(clover_param); - cloverSloppy->copy(*cloverPrecise); - inv_param->cloverGiB += cloverSloppy->GBytes(); - } else { - cloverSloppy = new cudaCloverField(clover_param); - cloverSloppy->copy(*cloverPrecise); - inv_param->cloverGiB += cloverSloppy->GBytes(); - } + inv_param->cloverGiB += cloverSloppy->GBytes(); profileClover.TPSTOP(QUDA_PROFILE_INIT); } else { cloverSloppy = cloverPrecise; -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - cloverInvSloppy = cloverInvPrecise; -#endif } // create the mirror preconditioner clover field if (inv_param->clover_cuda_prec_sloppy != inv_param->clover_cuda_prec_precondition && inv_param->clover_cuda_prec_precondition != QUDA_INVALID_PRECISION) { profileClover.TPSTART(QUDA_PROFILE_INIT); + clover_param.setPrecision(inv_param->clover_cuda_prec_precondition); - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - clover_param.direct = true; - clover_param.inverse = false; - cloverPrecondition = new cudaCloverField(clover_param); - cloverPrecondition->copy(*cloverSloppy); - inv_param->cloverGiB += cloverPrecondition->GBytes(); -#ifndef DYNAMIC_CLOVER - clover_param.direct = false; - clover_param.inverse = true; - clover_param.twisted = true; - cloverInvPrecondition = new cudaCloverField(clover_param); - cloverInvPrecondition->copy(*cloverInvSloppy, true); - inv_param->cloverGiB += cloverInvPrecondition->GBytes(); -#endif - } else { - cloverPrecondition = new cudaCloverField(clover_param); - cloverPrecondition->copy(*cloverSloppy); - inv_param->cloverGiB += cloverPrecondition->GBytes(); - } + cloverPrecondition = new cudaCloverField(clover_param); + cloverPrecondition->copy(*cloverSloppy, clover_param.inverse); + + inv_param->cloverGiB += cloverPrecondition->GBytes(); profileClover.TPSTOP(QUDA_PROFILE_INIT); } else { cloverPrecondition = cloverSloppy; -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - cloverInvPrecondition = cloverInvSloppy; -#endif } - // need to copy back the odd inverse field into the application clover field - if (!h_clovinv && pc_solve && !device_calc) { + // if requested, copy back the clover / inverse field + if ( inv_param->return_clover || inv_param->return_clover_inverse ) { + if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set"); + // copy the inverted clover term into host application order on the device clover_param.setPrecision(inv_param->clover_cpu_prec); - clover_param.direct = false; - clover_param.inverse = true; - clover_param.order = inv_param->clover_order; + clover_param.direct = (h_clover && inv_param->return_clover); + clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse); // this isn't really "epilogue" but this label suffices profileClover.TPSTART(QUDA_PROFILE_EPILOGUE); - cudaCloverField hack(clover_param); - hack.copy(*cloverPrecise); + cudaCloverField *hack = nullptr; + if (!dynamic_clover) { + clover_param.order = inv_param->clover_order; + hack = new cudaCloverField(clover_param); + hack->copy(*cloverPrecise); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse + } else { + cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack + hackOfTheHack->copy(*cloverPrecise, false); + cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); + if (inv_param->compute_clover_trlog) { + inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; + inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; + } + clover_param.order = inv_param->clover_order; + hack = new cudaCloverField(clover_param); + hack->copy(*hackOfTheHack); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse + delete hackOfTheHack; + } profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE); - // copy the odd components into the host application's clover field + // copy the field into the host application's clover field profileClover.TPSTART(QUDA_PROFILE_D2H); - qudaMemcpy((char*)(in->V(false))+in->Bytes()/2, (char*)(hack.V(true))+hack.Bytes()/2, - in->Bytes()/2, cudaMemcpyDeviceToHost); + if (inv_param->return_clover) { + qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost); + } + if (inv_param->return_clover_inverse) { + qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost); + } profileClover.TPSTOP(QUDA_PROFILE_D2H); + delete hack; checkCudaError(); } - if(!device_calc) - { - if (in) delete in; // delete object referencing input field -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inInv) delete inInv; -#endif - } + if (in) delete in; // delete object referencing input field popVerbosity(); @@ -1174,16 +1073,6 @@ void freeCloverQuda(void) cloverPrecondition = NULL; cloverSloppy = NULL; cloverPrecise = NULL; - - if (cloverInvPrecise != NULL) { - if (cloverInvPrecondition != cloverInvSloppy && cloverInvPrecondition) delete cloverInvPrecondition; - if (cloverInvSloppy != cloverInvPrecise && cloverInvSloppy) delete cloverInvSloppy; - if (cloverInvPrecise) delete cloverInvPrecise; - - cloverInvPrecondition = NULL; - cloverInvSloppy = NULL; - cloverInvPrecise = NULL; - } } void endQuda(void) @@ -1358,7 +1247,6 @@ namespace quda { diracParam.fatGauge = gaugeFatPrecise; diracParam.longGauge = gaugeLongPrecise; diracParam.clover = cloverPrecise; - diracParam.cloverInv = cloverInvPrecise; diracParam.kappa = kappa; diracParam.mass = inv_param->mass; diracParam.m5 = inv_param->m5; @@ -1376,7 +1264,6 @@ namespace quda { diracParam.fatGauge = gaugeFatSloppy; diracParam.longGauge = gaugeLongSloppy; diracParam.clover = cloverSloppy; - diracParam.cloverInv = cloverInvSloppy; for (int i=0; i<4; i++) { diracParam.commDim[i] = 1; // comms are always on @@ -1399,7 +1286,6 @@ namespace quda { diracParam.longGauge = gaugeLongPrecondition; } diracParam.clover = cloverPrecondition; - diracParam.cloverInv = cloverInvPrecondition; for (int i=0; i<4; i++) { diracParam.commDim[i] = comms ? 1 : 0; @@ -1524,8 +1410,6 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity if (gaugePrecise == NULL) errorQuda("Gauge field not allocated"); if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); - if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - errorQuda("Clover field not allocated"); pushVerbosity(inv_param->verbosity); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); @@ -1748,8 +1632,6 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) if (gaugePrecise == NULL) errorQuda("Gauge field not allocated"); if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); - if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - errorQuda("Clover field not allocated"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || @@ -1821,8 +1703,6 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) if (gaugePrecise == NULL) errorQuda("Gauge field not allocated"); if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); - if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - errorQuda("Clover field not allocated"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || @@ -2901,7 +2781,6 @@ void incrementalEigQuda(void *_h_x, void *_h_b, QudaInvertParam *param, void *_h diracHalfPrecParam.longGauge = gaugeLongPrecondition; diracHalfPrecParam.clover = cloverPrecondition; - diracHalfPrecParam.cloverInv = cloverInvPrecondition; for (int i=0; i<4; i++) { diracHalfPrecParam.commDim[i] = 1; // comms are on. @@ -3538,37 +3417,7 @@ void createCloverQuda(QudaInvertParam* invertParam) { profileCloverCreate.TPSTART(QUDA_PROFILE_TOTAL); profileCloverCreate.TPSTART(QUDA_PROFILE_INIT); - if(!cloverPrecise){ - CloverFieldParam cloverParam; - cloverParam.nDim = 4; - for(int dir=0; dir<4; ++dir) cloverParam.x[dir] = gaugePrecise->X()[dir]; - cloverParam.setPrecision(invertParam->clover_cuda_prec); - cloverParam.pad = invertParam->cl_pad; - cloverParam.direct = true; - cloverParam.inverse = true; - cloverParam.norm = 0; - cloverParam.invNorm = 0; - cloverParam.twisted = false; - cloverParam.create = QUDA_NULL_FIELD_CREATE; - cloverParam.siteSubset = QUDA_FULL_SITE_SUBSET; - cloverParam.setPrecision(invertParam->cuda_prec); - if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - { - - cloverParam.twisted = true; - cloverParam.mu2 = 4.*invertParam->kappa*invertParam->kappa*invertParam->mu*invertParam->mu; - cloverParam.direct = true; - cloverParam.inverse = false; - cloverPrecise = new cudaCloverField(cloverParam); -#ifndef DYNAMIC_CLOVER - cloverParam.inverse = true; - cloverParam.direct = false; - cloverInvPrecise = new cudaCloverField(cloverParam); //FIXME Only with tmClover -#endif - } else { - cloverPrecise = new cudaCloverField(cloverParam); - } - } + if (!cloverPrecise) errorQuda("Clover field not allocated"); int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction int y[4]; @@ -3595,36 +3444,11 @@ void createCloverQuda(QudaInvertParam* invertParam) // copy gaugePrecise into the extended device gauge field copyExtendedGauge(*cudaGaugeExtended, *gaugePrecise, QUDA_CUDA_FIELD_LOCATION); -#if 1 - profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT); - profileCloverCreate.TPSTART(QUDA_PROFILE_COMMS); - cudaGaugeExtended->exchangeExtendedGhost(R,true); - profileCloverCreate.TPSTOP(QUDA_PROFILE_COMMS); -#else - - GaugeFieldParam gParam(gaugePrecise->X(), gaugePrecise->Precision(), QUDA_RECONSTRUCT_NO, - pad, QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_NO); - gParam.create = QUDA_ZERO_FIELD_CREATE; - gParam.order = QUDA_MILC_GAUGE_ORDER; - gParam.siteSubset = QUDA_FULL_SITE_SUBSET; - gParam.t_boundary = gaugePrecise->TBoundary(); - gParam.nFace = 1; - - // create an extended gauge field on the host - for(int dir=0; dir<4; ++dir) gParam.x[dir] += 4; - cpuGaugeField cpuGaugeExtended(gParam); - cudaGaugeExtended->saveCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION); profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT); - // communicate data profileCloverCreate.TPSTART(QUDA_PROFILE_COMMS); - //exchange_cpu_sitelink_ex(const_cast(gaugePrecise->X()), R, (void**)cpuGaugeExtended.Gauge_p(), - // cpuGaugeExtended.Order(),cpuGaugeExtended.Precision(), 0, 4); - cpuGaugeExtended.exchangeExtendedGhost(R,true); - - cudaGaugeExtended->loadCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION); + cudaGaugeExtended->exchangeExtendedGhost(R,true); profileCloverCreate.TPSTOP(QUDA_PROFILE_COMMS); -#endif } #ifdef MULTI_GPU @@ -3633,7 +3457,6 @@ void createCloverQuda(QudaInvertParam* invertParam) GaugeField *gauge = gaugePrecise; #endif - profileCloverCreate.TPSTART(QUDA_PROFILE_INIT); // create the Fmunu field GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, pad, QUDA_TENSOR_GEOMETRY); @@ -3644,16 +3467,8 @@ void createCloverQuda(QudaInvertParam* invertParam) profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT); profileCloverCreate.TPSTART(QUDA_PROFILE_COMPUTE); - computeFmunu(Fmunu, *gauge, QUDA_CUDA_FIELD_LOCATION); computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION); - -#ifndef DYNAMIC_CLOVER - if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - computeClover(*cloverInvPrecise, Fmunu, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION); // FIXME only with tmClover - } -#endif - profileCloverCreate.TPSTOP(QUDA_PROFILE_COMPUTE); profileCloverCreate.TPSTOP(QUDA_PROFILE_TOTAL); diff --git a/lib/tune.cpp b/lib/tune.cpp index 7889039fc4..16e8906de9 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -536,7 +536,7 @@ namespace quda { } else if (!tuning) { TuneParam best_param; - cudaError_t error; + cudaError_t error = cudaSuccess; cudaEvent_t start, end; float elapsed_time, best_time; time_t now; @@ -599,7 +599,7 @@ namespace quda { } if (best_time == FLT_MAX) { - errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name, key.aux, key.volume); + errorQuda("Auto-tuning failed for %s with %s at vol=%s, error %s", key.name, key.aux, key.volume, cudaGetErrorString(error)); } if (verbosity >= QUDA_VERBOSE) { printfQuda("Tuned %s giving %s for %s with %s\n", tunable.paramString(best_param).c_str(), diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4c2a793310..bfd8d08ef8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -26,11 +26,11 @@ endif() #define tests -if(${QUDA_DIRAC_WILSON} OR ${QUDA_DIRAC_DOMAIN_WALL}) - cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp) +if(${QUDA_DIRAC_WILSON} OR ${QUDA_DIRAC_CLOVER} OR ${QUDA_DIRAC_TWISTED_MASS} OR ${QUDA_DIRAC_TWISTED_CLOVER} OR ${QUDA_DIRAC_DOMAIN_WALL}) + cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp blas_reference.cpp) target_link_libraries(dslash_test ${TEST_LIBS} ) - cuda_add_executable(invert_test invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp) + cuda_add_executable(invert_test invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp blas_reference.cpp) target_link_libraries(invert_test ${TEST_LIBS}) endif() @@ -38,15 +38,15 @@ cuda_add_executable(deflation_test deflation_test.cpp wilson_dslash_reference.cp target_link_libraries(deflation_test ${TEST_LIBS}) if(${QUDA_DIRAC_STAGGERED}) - cuda_add_executable(staggered_dslash_test staggered_dslash_test.cpp staggered_dslash_reference.cpp) + cuda_add_executable(staggered_dslash_test staggered_dslash_test.cpp staggered_dslash_reference.cpp blas_reference.cpp) target_link_libraries(staggered_dslash_test ${TEST_LIBS}) - cuda_add_executable(staggered_invert_test staggered_invert_test.cpp staggered_dslash_reference.cpp blas_reference.cpp) + cuda_add_executable(staggered_invert_test staggered_invert_test.cpp staggered_dslash_reference.cpp blas_reference.cpp) target_link_libraries(staggered_invert_test ${TEST_LIBS}) endif() if(${QUDA_MULTIGRID}) - cuda_add_executable(multigrid_invert_test multigrid_invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp) + cuda_add_executable(multigrid_invert_test multigrid_invert_test.cpp wilson_dslash_reference.cpp clover_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp) target_link_libraries(multigrid_invert_test ${TEST_LIBS}) cuda_add_executable(multigrid_benchmark_test multigrid_benchmark_test.cu) diff --git a/tests/Makefile b/tests/Makefile index c37e85ddc8..f948fa13ba 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -56,13 +56,13 @@ TESTS = su3_test pack_test blas_test dslash_test invert_test \ all: $(TESTS) -dslash_test: dslash_test.o test_util.o gtest-all.o wilson_dslash_reference.o domain_wall_dslash_reference.o misc.o $(QUDA) +dslash_test: dslash_test.o test_util.o gtest-all.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) -invert_test: invert_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) +invert_test: invert_test.o test_util.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) -multigrid_invert_test: multigrid_invert_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) +multigrid_invert_test: multigrid_invert_test.o test_util.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) multigrid_benchmark_test: multigrid_benchmark_test.o test_util.o misc.o $(QUDA) @@ -71,7 +71,7 @@ multigrid_benchmark_test: multigrid_benchmark_test.o test_util.o misc.o $(QUDA) deflation_test: deflation_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) -staggered_dslash_test: staggered_dslash_test.o gtest-all.o test_util.o staggered_dslash_reference.o misc.o $(QUDA) +staggered_dslash_test: staggered_dslash_test.o gtest-all.o test_util.o staggered_dslash_reference.o misc.o blas_reference.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) staggered_invert_test: staggered_invert_test.o test_util.o staggered_dslash_reference.o misc.o blas_reference.o $(QUDA) diff --git a/tests/blas_reference.cpp b/tests/blas_reference.cpp index 1651734150..056926d1ef 100644 --- a/tests/blas_reference.cpp +++ b/tests/blas_reference.cpp @@ -50,6 +50,18 @@ double norm_2(void *v, int len, QudaPrecision precision) { else return norm2((float*)v, len); } +// performs the operation y[i] = x[i] + a*y[i] +template +static inline void xpay(Float *x, Float a, Float *y, int len) { + for (int i=0; i +#include +#include +#include + +#include +#include +#include +#include + + +/** + @brief Apply the clover matrix field + @param[out] out Result field (single parity) + @param[in] clover Clover-matrix field (full field) + @param[in] in Input field (single parity) + @param[in] parity Parity to which we are applying the clover field + */ +template +void cloverReference(sFloat *out, cFloat *clover, sFloat *in, int parity) { + int nSpin = 4; + int nColor = 3; + int N = nColor * nSpin / 2; + int chiralBlock = N + 2*(N-1)*N/2; + + for (int i=0; i *In = reinterpret_cast*>(&in[i*nSpin*nColor*2]); + std::complex *Out = reinterpret_cast*>(&out[i*nSpin*nColor*2]); + + for (int chi=0; chi *L = reinterpret_cast*>(&D[N]); + + for (int s_col=0; s_col(out), static_cast(clover), static_cast(in), parity); + break; + case QUDA_SINGLE_PRECISION: + cloverReference(static_cast(out), static_cast(clover), static_cast(in), parity); + break; + default: + errorQuda("Unsupported precision %d", precision); + } + +} + +void clover_dslash(void *out, void **gauge, void *clover, void *in, int parity, + int dagger, QudaPrecision precision, QudaGaugeParam ¶m) { + void *tmp = malloc(Vh*spinorSiteSize*precision); + + wil_dslash(tmp, gauge, in, parity, dagger, precision, param); + apply_clover(out, clover, tmp, parity, precision); + + free(tmp); +} + +// Apply the even-odd preconditioned Wilson-clover operator +void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, + QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + + double kappa2 = -kappa*kappa; + void *tmp = malloc(Vh*spinorSiteSize*precision); + + switch(matpc_type) { + case QUDA_MATPC_EVEN_EVEN: + if (!dagger) { + wil_dslash(tmp, gauge, in, 1, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 1, precision); + wil_dslash(tmp, gauge, out, 0, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 0, precision); + } else { + apply_clover(tmp, clover_inv, in, 0, precision); + wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param); + apply_clover(tmp, clover_inv, out, 1, precision); + wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param); + } + xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_EVEN_EVEN_ASYMMETRIC: + wil_dslash(out, gauge, in, 1, dagger, precision, gauge_param); + apply_clover(tmp, clover_inv, out, 1, precision); + wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param); + apply_clover(tmp, clover, in, 0, precision); + xpay(tmp, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_ODD_ODD: + if (!dagger) { + wil_dslash(tmp, gauge, in, 0, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 0, precision); + wil_dslash(tmp, gauge, out, 1, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 1, precision); + } else { + apply_clover(tmp, clover_inv, in, 1, precision); + wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param); + apply_clover(tmp, clover_inv, out, 0, precision); + wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param); + } + xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_ODD_ODD_ASYMMETRIC: + wil_dslash(out, gauge, in, 0, dagger, precision, gauge_param); + apply_clover(tmp, clover_inv, out, 0, precision); + wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param); + apply_clover(tmp, clover, in, 1, precision); + xpay(tmp, kappa2, out, Vh*spinorSiteSize, precision); + break; + default: + errorQuda("Unsupoorted matpc=%d", matpc_type); + } + + free(tmp); +} + +// Apply the full Wilson-clover operator +void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, + int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + + void *tmp = malloc(V*spinorSiteSize*precision); + + void *inEven = in; + void *inOdd = (char*)in + Vh*spinorSiteSize*precision; + void *outEven = out; + void *outOdd = (char*)out + Vh*spinorSiteSize*precision; + void *tmpEven = tmp; + void *tmpOdd = (char*)tmp + Vh*spinorSiteSize*precision; + + // Odd part + wil_dslash(outOdd, gauge, inEven, 1, dagger, precision, gauge_param); + apply_clover(tmpOdd, clover, inOdd, 1, precision); + + // Even part + wil_dslash(outEven, gauge, inOdd, 0, dagger, precision, gauge_param); + apply_clover(tmpEven, clover, inEven, 0, precision); + + // lastly apply the kappa term + xpay(tmp, -kappa, out, V*spinorSiteSize, precision); + + free(tmp); +} + +void applyTwist(void *out, void *in, void *tmpH, double a, QudaPrecision precision) { + switch (precision) { + case QUDA_DOUBLE_PRECISION: + for(int i = 0; i < Vh; i++) + for(int s = 0; s < 4; s++) { + double a5 = ((s / 2) ? -1.0 : +1.0) * a; + for(int c = 0; c < 3; c++) { + ((double *) out)[i * 24 + s * 6 + c * 2 + 0] = ((double *) tmpH)[i * 24 + s * 6 + c * 2 + 0] - a5*((double *) in)[i * 24 + s * 6 + c * 2 + 1]; + ((double *) out)[i * 24 + s * 6 + c * 2 + 1] = ((double *) tmpH)[i * 24 + s * 6 + c * 2 + 1] + a5*((double *) in)[i * 24 + s * 6 + c * 2 + 0]; + } + } + break; + case QUDA_SINGLE_PRECISION: + for(int i = 0; i < Vh; i++) + for(int s = 0; s < 4; s++) { + float a5 = ((s / 2) ? -1.0 : +1.0) * a; + for(int c = 0; c < 3; c++) { + ((float *) out)[i * 24 + s * 6 + c * 2 + 0] = ((float *) tmpH)[i * 24 + s * 6 + c * 2 + 0] - a5*((float *) in)[i * 24 + s * 6 + c * 2 + 1]; + ((float *) out)[i * 24 + s * 6 + c * 2 + 1] = ((float *) tmpH)[i * 24 + s * 6 + c * 2 + 1] + a5*((float *) in)[i * 24 + s * 6 + c * 2 + 0]; + } + } + break; + default: + errorQuda("Unsupported precision %d", precision); + } +} + +// Apply (C + i*a*gamma_5)/(C^2 + a^2) +void twistCloverGamma5(void *out, void *in, void *clover, void *cInv, const int dagger, const double kappa, const double mu, + const QudaTwistFlavorType flavor, const int parity, QudaTwistGamma5Type twist, QudaPrecision precision) { + void *tmp1 = malloc(Vh*spinorSiteSize*precision); + void *tmp2 = malloc(Vh*spinorSiteSize*precision); + + double a = 0.0; + + if (twist == QUDA_TWIST_GAMMA5_DIRECT) { + a = 2.0 * kappa * mu * flavor; + + if (dagger) a *= -1.0; + + apply_clover(tmp1, clover, in, parity, precision); + applyTwist(out, in, tmp1, a, precision); + } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) { + a = -2.0 * kappa * mu * flavor; + + if (dagger) a *= -1.0; + + apply_clover(tmp1, clover, in, parity, precision); + applyTwist(tmp2, in, tmp1, a, precision); + apply_clover(out, cInv, tmp2, parity, precision); + } else { + printf("Twist type %d not defined\n", twist); + exit(0); + } + + free(tmp2); + free(tmp1); +} + +void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, + int parity, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam ¶m) { + void *tmp1 = malloc(Vh*spinorSiteSize*precision); + void *tmp2 = malloc(Vh*spinorSiteSize*precision); + + if (dagger) { + twistCloverGamma5(tmp1, in, clover, cInv, dagger, kappa, mu, flavor, 1-parity, QUDA_TWIST_GAMMA5_INVERSE, precision); + if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { + wil_dslash(tmp2, gauge, tmp1, parity, dagger, precision, param); + twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision); + } else { + wil_dslash(out, gauge, tmp1, parity, dagger, precision, param); + } + } else { + wil_dslash(tmp1, gauge, in, parity, dagger, precision, param); + twistCloverGamma5(out, tmp1, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision); + } + + free(tmp2); + free(tmp1); +} + +// Apply the full twisted-clover operator +void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, + QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + + void *tmp = malloc(V*spinorSiteSize*precision); + + void *inEven = in; + void *inOdd = (char*)in + Vh*spinorSiteSize*precision; + void *outEven = out; + void *outOdd = (char*)out + Vh*spinorSiteSize*precision; + void *tmpEven = tmp; + void *tmpOdd = (char*)tmp + Vh*spinorSiteSize*precision; + + // Odd part + wil_dslash(outOdd, gauge, inEven, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmpOdd, inOdd, clover, NULL, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_DIRECT, precision); + + // Even part + wil_dslash(outEven, gauge, inOdd, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmpEven, inEven, clover, NULL, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_DIRECT, precision); + + // lastly apply the kappa term + xpay(tmp, -kappa, out, V*spinorSiteSize, precision); + + free(tmp); +} + +// Apply the even-odd preconditioned Dirac operator +void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, + QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + + double kappa2 = -kappa*kappa; + + void *tmp1 = malloc(Vh*spinorSiteSize*precision); + void *tmp2 = malloc(Vh*spinorSiteSize*precision); + + switch(matpc_type) { + case QUDA_MATPC_EVEN_EVEN: + if (!dagger) { + wil_dslash(out, gauge, in, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmp1, out, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(tmp2, gauge, tmp1, 0, dagger, precision, gauge_param); + twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + } else { + twistCloverGamma5(out, in, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(tmp1, gauge, out, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(out, gauge, tmp2, 0, dagger, precision, gauge_param); + } + xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_EVEN_EVEN_ASYMMETRIC: + wil_dslash(tmp1, gauge, in, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(out, gauge, tmp2, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, in, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_DIRECT, precision); + xpay(tmp2, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_ODD_ODD: + if (!dagger) { + wil_dslash(out, gauge, in, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmp1, out, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(tmp2, gauge, tmp1, 1, dagger, precision, gauge_param); + twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + } else { + twistCloverGamma5(out, in, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(tmp1, gauge, out, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(out, gauge, tmp2, 1, dagger, precision, gauge_param); + } + xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_ODD_ODD_ASYMMETRIC: + wil_dslash(tmp1, gauge, in, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(out, gauge, tmp2, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmp1, in, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_DIRECT, precision); + xpay(tmp1, kappa2, out, Vh*spinorSiteSize, precision); + break; + default: + errorQuda("Unsupported matpc=%d", matpc_type); + } + + free(tmp2); + free(tmp1); +} diff --git a/tests/domain_wall_dslash_reference.cpp b/tests/domain_wall_dslash_reference.cpp index 2b5ce2dd90..a2e53ff9a8 100644 --- a/tests/domain_wall_dslash_reference.cpp +++ b/tests/domain_wall_dslash_reference.cpp @@ -16,12 +16,6 @@ using namespace quda; -void xpay(void *x, double a, void *y, int length, QudaPrecision precision) { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)x, a, (double*)y, length); - else xpay((float*)x, (float)a, (float*)y, length); -} - - // i represents a "half index" into an even or odd "half lattice". // when oddBit={0,1} the half lattice is {even,odd}. // @@ -839,17 +833,12 @@ void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit, if (precision == QUDA_DOUBLE_PRECISION) { if (zero_initialize) dslashReference_5th((double*)out, (double*)in, oddBit, daggerBit, mferm); else dslashReference_5th((double*)out, (double*)in, oddBit, daggerBit, mferm); - for(int xs = 0 ; xs < Ls ; xs++) - { - xpay((double*)in + Vh*spinorSiteSize*xs, kappa[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - } } else { if (zero_initialize) dslashReference_5th((float*)out, (float*)in, oddBit, daggerBit, (float)mferm); else dslashReference_5th((float*)out, (float*)in, oddBit, daggerBit, (float)mferm); - for(int xs = 0 ; xs < Ls ; xs++) - { - xpay((float*)in + Vh*spinorSiteSize*xs, (float)(kappa[xs]), (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - } + } + for(int xs = 0 ; xs < Ls ; xs++) { + xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa[xs], (char*)out + precision*Vh*spinorSiteSize*xs, Vh*spinorSiteSize, precision); } } @@ -921,10 +910,8 @@ void mdw_mat(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c mdw_dslash_5(tmp, gauge, inOdd, 1, dagger, precision, gauge_param, mferm, kappa5, true); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) - xpay((double*)tmp + Vh*spinorSiteSize*xs, -kappa_b[xs], (double*)outOdd + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else - xpay((float*)tmp + Vh*spinorSiteSize*xs, -(float)kappa_b[xs], (float*)outOdd + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, -kappa_b[xs], (char*)outOdd + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } mdw_dslash_4_pre(tmp, gauge, inOdd, 1, dagger, precision, gauge_param, mferm, b5, c5, true); @@ -932,10 +919,8 @@ void mdw_mat(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c mdw_dslash_5(tmp, gauge, inEven, 0, dagger, precision, gauge_param, mferm, kappa5, true); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) - xpay((double*)tmp + Vh*spinorSiteSize*xs, -kappa_b[xs], (double*)outEven + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else - xpay((float*)tmp + Vh*spinorSiteSize*xs, -(float)kappa_b[xs], (float*)outEven + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, -kappa_b[xs], (char*)outEven + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } free(kappa5); @@ -1040,10 +1025,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm); dslash_5_inv(out, gauge, tmp, parity[0], dagger, precision, gauge_param, mferm, kappa_mdwf); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) - xpay((double*)in + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else - xpay((float*)in + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } } else if (symmetric && dagger) { dslash_5_inv(tmp, gauge, in, parity[1], dagger, precision, gauge_param, mferm, kappa_mdwf); @@ -1053,10 +1036,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm); mdw_dslash_4_pre(out, gauge, tmp, parity[1], dagger, precision, gauge_param, mferm, b5, c5, true); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) - xpay((double*)in + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else - xpay((float*)in + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } } else if (!symmetric && !dagger) { mdw_dslash_4_pre(out, gauge, in, parity[1], dagger, precision, gauge_param, mferm, b5, c5, true); @@ -1066,8 +1047,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa dslash_4_4d(out, gauge, tmp, parity[1], dagger, precision, gauge_param, mferm); mdw_dslash_5(tmp, gauge, in, parity[0], dagger, precision, gauge_param, mferm, kappa5, true); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else xpay((float*)tmp + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } } else if (!symmetric && dagger) { dslash_4_4d(out, gauge, in, parity[0], dagger, precision, gauge_param, mferm); @@ -1076,11 +1057,10 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm); mdw_dslash_4_pre(out, gauge, tmp, parity[0], dagger, precision, gauge_param, mferm, b5, c5, true); mdw_dslash_5(tmp, gauge, in, parity[0], dagger, precision, gauge_param, mferm, kappa5, true); - for(int xs = 0 ; xs < Ls ; xs++) - { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else xpay((float*)tmp + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - } + for(int xs = 0 ; xs < Ls ; xs++) { + xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); + } } else { errorQuda("Unsupported matpc_type=%d dagger=%d", matpc_type, dagger); } diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index a84957f955..63638913e9 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -69,12 +69,17 @@ extern QudaPrecision prec; extern QudaDagType dagger; QudaDagType not_dagger; +extern bool compute_clover; +extern double clover_coeff; + extern bool verify_results; extern int niter; extern char latfile[]; extern bool kernel_pack_t; +QudaVerbosity verbosity = QUDA_VERBOSE; + void init(int argc, char **argv) { cuda_prec = prec; @@ -142,6 +147,7 @@ void init(int argc, char **argv) { inv_param.Ls = (inv_param.twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) ? Ls : 2; + inv_param.solve_type = (test_type == 2 || test_type == 4) ? QUDA_DIRECT_SOLVE : QUDA_DIRECT_PC_SOLVE; inv_param.matpc_type = matpc_type; inv_param.dagger = dagger; not_dagger = (QudaDagType)((dagger + 1)%2); @@ -234,20 +240,11 @@ void init(int argc, char **argv) { inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; - inv_param.clover_coeff = 1.5*inv_param.kappa; - //if (test_type > 0) { - hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); - hostCloverInv = hostClover; // fake it - /*} else { - hostClover = NULL; - hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); - }*/ - } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { - + inv_param.clover_coeff = clover_coeff; + hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); + hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); } - setVerbosity(QUDA_VERBOSE); - // construct input fields for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); @@ -255,9 +252,6 @@ void init(int argc, char **argv) { csParam.nColor = 3; csParam.nSpin = 4; - if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - csParam.twistFlavor = inv_param.twist_flavor; - } csParam.nDim = 4; for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || @@ -274,7 +268,7 @@ void init(int argc, char **argv) { } //ndeg_tm - if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { + if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { csParam.twistFlavor = inv_param.twist_flavor; csParam.nDim = (inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) ? 4 : 5; csParam.x[4] = inv_param.Ls; @@ -284,15 +278,11 @@ void init(int argc, char **argv) { csParam.precision = inv_param.cpu_prec; csParam.pad = 0; - if(dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || - dslash_type == QUDA_MOBIUS_DWF_DSLASH) - { + if(dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || dslash_type == QUDA_MOBIUS_DWF_DSLASH) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; - - } else - { - if (test_type < 2 || test_type ==3) { + } else { + if (test_type < 2 || test_type == 3) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } else { @@ -321,33 +311,40 @@ void init(int argc, char **argv) { construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); } - spinor->Source(QUDA_RANDOM_SOURCE); + spinor->Source(QUDA_RANDOM_SOURCE, 0); - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { - double norm = 0.0; // clover components are random numbers in the range (-norm, norm) + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + double norm = 0.1; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal - - if (test_type == 2 || test_type == 4) { - construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); - } else { - construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec); - } + construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); + memcpy(hostCloverInv, hostClover, V*cloverSiteSize*inv_param.clover_cpu_prec); } + printfQuda("done.\n"); fflush(stdout); initQuda(device); + if (tune) { + setTuning(QUDA_TUNE_YES); + printfQuda("Tuning...\n"); + } + + // set verbosity prior to loadGaugeQuda + setVerbosity(verbosity); + inv_param.verbosity = verbosity; + printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { - printfQuda("Sending clover field to GPU\n"); - loadCloverQuda(hostClover, hostCloverInv, &inv_param); - } + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (compute_clover) printfQuda("Computing clover field on GPU\n"); + else printfQuda("Sending clover field to GPU\n"); + inv_param.compute_clover = compute_clover; + inv_param.return_clover = compute_clover; + inv_param.compute_clover_inverse = compute_clover; + inv_param.return_clover_inverse = compute_clover; - if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - printfQuda("Sending clover field to GPU\n"); - loadCloverQuda(NULL, NULL, &inv_param); + loadCloverQuda(hostClover, hostCloverInv, &inv_param); } if (!transfer) { @@ -444,8 +441,8 @@ void end() { delete spinorTmp; for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); - if((dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)){ - if (hostClover != hostCloverInv && hostClover) free(hostClover); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + free(hostClover); free(hostCloverInv); } endQuda(); @@ -548,12 +545,16 @@ double dslashCUDA(int niter) { } else { switch (test_type) { case 0: - if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH && (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD)) { + if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { if (transfer) { dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity); } else { - ((DiracTwistedCloverPC *) dirac)->TwistCloverInv(*tmp1, *cudaSpinor, (parity+1)%2); - dirac->Dslash(*cudaSpinorOut, *tmp1, parity); + if (dagger) { + ((DiracTwistedCloverPC *) dirac)->TwistCloverInv(*tmp1, *cudaSpinor, (parity+1)%2); + dirac->Dslash(*cudaSpinorOut, *tmp1, parity); + } else { + dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); + } } } else { if (transfer) { @@ -618,13 +619,12 @@ void dslashRef() { printfQuda("Calculating reference implementation..."); fflush(stdout); - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || - dslash_type == QUDA_WILSON_DSLASH) { + if (dslash_type == QUDA_WILSON_DSLASH) { switch (test_type) { case 0: wil_dslash(spinorRef->V(), hostGauge, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param); break; - case 1: + case 1: wil_matpc(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); break; @@ -645,7 +645,34 @@ void dslashRef() { printfQuda("Test type not defined\n"); exit(-1); } - } else if((dslash_type == QUDA_TWISTED_MASS_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)){ + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + switch (test_type) { + case 0: + clover_dslash(spinorRef->V(), hostGauge, hostCloverInv, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param); + break; + case 1: + clover_matpc(spinorRef->V(), hostGauge, hostClover, hostCloverInv, spinor->V(), inv_param.kappa, inv_param.matpc_type, + dagger, inv_param.cpu_prec, gauge_param); + break; + case 2: + clover_mat(spinorRef->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, dagger, inv_param.cpu_prec, gauge_param); + break; + case 3: + clover_matpc(spinorTmp->V(), hostGauge, hostClover, hostCloverInv, spinor->V(), inv_param.kappa, inv_param.matpc_type, + dagger, inv_param.cpu_prec, gauge_param); + clover_matpc(spinorRef->V(), hostGauge, hostClover, hostCloverInv, spinorTmp->V(), inv_param.kappa, inv_param.matpc_type, + not_dagger, inv_param.cpu_prec, gauge_param); + break; + case 4: + clover_mat(spinorTmp->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, dagger, inv_param.cpu_prec, gauge_param); + clover_mat(spinorRef->V(), hostGauge, hostClover, spinorTmp->V(), inv_param.kappa, not_dagger, + inv_param.cpu_prec, gauge_param); + break; + default: + printfQuda("Test type not defined\n"); + exit(-1); + } + } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { switch (test_type) { case 0: if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) @@ -697,7 +724,7 @@ void dslashRef() { } break; case 3: - if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS){ + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { tm_matpc(spinorTmp->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); tm_matpc(spinorRef->V(), hostGauge, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, @@ -721,7 +748,7 @@ void dslashRef() { } break; case 4: - if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS){ + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { tm_mat(spinorTmp->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, dagger, inv_param.cpu_prec, gauge_param); tm_mat(spinorRef->V(), hostGauge, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, @@ -748,6 +775,46 @@ void dslashRef() { printfQuda("Test type not defined\n"); exit(-1); } + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + switch (test_type) { + case 0: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) + tmc_dslash(spinorRef->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); + else + errorQuda("Not supported\n"); + break; + case 1: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) + tmc_matpc(spinorRef->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); + else + errorQuda("Not supported\n"); + break; + case 2: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) + tmc_mat(spinorRef->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, dagger, inv_param.cpu_prec, gauge_param); + else + errorQuda("Not supported\n"); + break; + case 3: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { + tmc_matpc(spinorTmp->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); + tmc_matpc(spinorRef->V(), hostGauge, spinorTmp->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); + } else + errorQuda("Not supported\n"); + break; + case 4: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { + tmc_mat(spinorTmp->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, dagger, inv_param.cpu_prec, gauge_param); + tmc_mat(spinorRef->V(), hostGauge, hostClover, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, not_dagger, inv_param.cpu_prec, gauge_param); + } else + errorQuda("Not supported\n"); + break; + default: + printfQuda("Test type not defined\n"); + exit(-1); + } } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ){ switch (test_type) { case 0: @@ -907,8 +974,6 @@ int main(int argc, char **argv) for (int i=0; i -static inline void xpay(Float *x, Float a, Float *y, int len) { - for (int i=0; i static inline void axpy(Float a, Float *x, Float *y, int len) { diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index 9f239ba77e..2eabfe7587 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -57,6 +57,9 @@ extern double tol_hq; // heavy-quark tolerance for inverter extern QudaMassNormalization normalization; // mass normalization of Dirac operators extern QudaMatPCType matpc_type; // preconditioning type +extern double clover_coeff; +extern bool compute_clover; + extern int niter; // max solver iterations extern char latfile[]; @@ -184,12 +187,14 @@ int main(int argc, char **argv) for (int i=0; iVolume(); void *evenOut = spinorCheck; void *oddOut = cpu_prec == sizeof(double) ? (void*)((double*)evenOut + tm_offset): (void*)((float*)evenOut + tm_offset); - + void *evenIn = spinorOut; void *oddIn = cpu_prec == sizeof(double) ? (void*)((double*)evenIn + tm_offset): (void*)((float*)evenIn + tm_offset); - + tm_ndeg_mat(evenOut, oddOut, gauge, evenIn, oddIn, inv_param.kappa, inv_param.mu, inv_param.epsilon, 0, inv_param.cpu_prec, gauge_param); } - } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + tmc_mat(spinorCheck, gauge, clover, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 0, + inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_WILSON_DSLASH) { wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + clover_mat(spinorCheck, gauge, clover, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { dw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass); } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) { @@ -499,14 +511,22 @@ int main(int argc, char **argv) } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) { - if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) errorQuda("Twisted mass solution type not supported"); tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); - } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { - wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) + errorQuda("Twisted mass solution type not supported"); + tmc_matpc(spinorCheck, gauge, spinorOut, clover, clover_inv, inv_param.kappa, inv_param.mu, + inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_WILSON_DSLASH) { + wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + clover_matpc(spinorCheck, gauge, clover, clover_inv, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, + inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { dw_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass); } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) { @@ -544,18 +564,30 @@ int main(int argc, char **argv) ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); - if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) errorQuda("Twisted mass solution type not supported"); tm_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); - } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) + errorQuda("Twisted mass solution type not supported"); + tmc_matpc(spinorTmp, gauge, spinorOut, clover, clover_inv, inv_param.kappa, inv_param.mu, + inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); + tmc_matpc(spinorCheck, gauge, spinorTmp, clover, clover_inv, inv_param.kappa, inv_param.mu, + inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_WILSON_DSLASH) { wil_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + clover_matpc(spinorTmp, gauge, clover, clover_inv, spinorOut, inv_param.kappa, + inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); + clover_matpc(spinorCheck, gauge, clover, clover_inv, spinorTmp, inv_param.kappa, + inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); } else { errorQuda("Unsupported dslash_type"); } @@ -587,5 +619,12 @@ int main(int argc, char **argv) finalizeComms(); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (clover) free(clover); + if (clover_inv) free(clover_inv); + } + + for (int dir = 0; dir<4; dir++) free(gauge[dir]); + return 0; } diff --git a/tests/multigrid_invert_test.cpp b/tests/multigrid_invert_test.cpp index a3b88d4c11..8b842ec10f 100644 --- a/tests/multigrid_invert_test.cpp +++ b/tests/multigrid_invert_test.cpp @@ -72,7 +72,8 @@ extern QudaTwistFlavorType twist_flavor; extern void usage(char** ); -double clover_coeff = 1.0; +extern double clover_coeff; +extern bool compute_clover; namespace quda { extern void setTransferGPU(bool); @@ -171,6 +172,7 @@ void setMultigridParam(QudaMultigridParam &mg_param) { inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy; inv_param.clover_cuda_prec_precondition = cuda_prec_precondition; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; + inv_param.clover_coeff = clover_coeff; } inv_param.input_location = QUDA_CPU_FIELD_LOCATION; @@ -195,8 +197,6 @@ void setMultigridParam(QudaMultigridParam &mg_param) { } } - inv_param.clover_coeff = clover_coeff; - inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; @@ -415,7 +415,7 @@ int main(int argc, char **argv) size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); - void *gauge[4], *clover_inv=0;//, *clover=0; + void *gauge[4], *clover=0, *clover_inv=0; for (int dir = 0; dir < 4; dir++) { gauge[dir] = malloc(V*gaugeSiteSize*gSize); @@ -436,25 +436,15 @@ int main(int argc, char **argv) double norm = 0.1; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal - size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); + size_t cSize = inv_param.clover_cpu_prec; + clover = malloc(V*cloverSiteSize*cSize); clover_inv = malloc(V*cloverSiteSize*cSize); - construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec); - - // The uninverted clover term is only needed when solving the unpreconditioned - // system or when using "asymmetric" even/odd preconditioning. - int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE || - inv_param.solve_type == QUDA_NORMOP_PC_SOLVE); - int asymmetric = preconditioned && - (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || - inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC); - if (!preconditioned) { - //clover = clover_inv; - clover_inv = NULL; - } else if (asymmetric) { // fake it by using the same random matrix - //clover = clover_inv; // for both clover and clover_inv - } else { - //clover = NULL; - } + if (!compute_clover) construct_clover_field(clover, norm, diag, inv_param.clover_cpu_prec); + + inv_param.compute_clover = compute_clover; + if (compute_clover) inv_param.return_clover = 1; + inv_param.compute_clover_inverse = 1; + inv_param.return_clover_inverse = 1; } void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls); @@ -472,12 +462,10 @@ int main(int argc, char **argv) // load the gauge field loadGaugeQuda((void*)gauge, &gauge_param); - // load the clover term, if desired - //if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param); - // this line ensure that if we need to construct the clover inverse (in either the smoother or the solver) we do so if (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE || solve_type == QUDA_DIRECT_PC_SOLVE) inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(NULL, NULL, &inv_param); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param); + inv_param.solve_type = solve_type; // restore actual solve_type we want to do // setup the multigrid solver @@ -577,11 +565,14 @@ int main(int argc, char **argv) endQuda(); // finalize the communications layer -#if defined(QMP_COMMS) - QMP_finalize_msg_passing(); -#elif defined(MPI_COMMS) - MPI_Finalize(); -#endif + finalizeComms(); + + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (clover) free(clover); + if (clover_inv) free(clover_inv); + } + + for (int dir = 0; dir<4; dir++) free(gauge[dir]); return 0; } diff --git a/tests/staggered_dslash_reference.cpp b/tests/staggered_dslash_reference.cpp index 6932e07c7b..ea845cb339 100644 --- a/tests/staggered_dslash_reference.cpp +++ b/tests/staggered_dslash_reference.cpp @@ -12,6 +12,7 @@ #include #include +#include extern void *memset(void *s, int c, size_t n); @@ -126,10 +127,7 @@ void Mat(sFloat *out, gFloat **fatlink, gFloat** longlink, sFloat *in, sFloat ka // full dslash operator dslashReference(outOdd, fatlink, longlink, inEven, 1, daggerBit); dslashReference(outEven, fatlink, longlink, inOdd, 0, daggerBit); - - // lastly apply the kappa term - xpay(in, -kappa, out, V*mySpinorSiteSize); -} + } void @@ -150,6 +148,9 @@ mat(void *out, void **fatlink, void** longlink, void *in, double kappa, int dagg Mat((float*)out, (float**)fatlink, (float**)longlink, (float*)in, (float)kappa, dagger_bit); } } + + // lastly apply the kappa term + xpay(in, -kappa, out, V*mySpinorSiteSize, sPrecision); } @@ -220,27 +221,19 @@ matdagmat(void *out, void **fatlink, void** longlink, void *in, double mass, int // Apply the even-odd preconditioned Dirac operator template -static void MatPC(sFloat *outEven, gFloat **fatlink, gFloat** longlink, sFloat *inEven, sFloat kappa, - int daggerBit, QudaMatPCType matpc_type) { +static void MatPC(sFloat *outEven, gFloat **fatlink, gFloat** longlink, sFloat *inEven, int dagger, QudaMatPCType matpc_type) { sFloat *tmp = (sFloat*)malloc(Vh*mySpinorSiteSize*sizeof(sFloat)); // full dslash operator if (matpc_type == QUDA_MATPC_EVEN_EVEN) { - dslashReference(tmp, fatlink, longlink, inEven, 1, daggerBit); - dslashReference(outEven, fatlink, longlink, tmp, 0, daggerBit); - - //dslashReference(outEven, fatlink, longlink, inEven, 1, daggerBit); + dslashReference(tmp, fatlink, longlink, inEven, 1, dagger); + dslashReference(outEven, fatlink, longlink, tmp, 0, dagger); } else { - dslashReference(tmp, fatlink, longlink, inEven, 0, daggerBit); - dslashReference(outEven, fatlink, longlink, tmp, 1, daggerBit); + dslashReference(tmp, fatlink, longlink, inEven, 0, dagger); + dslashReference(outEven, fatlink, longlink, tmp, 1, dagger); } - // lastly apply the kappa term - - sFloat kappa2 = -kappa*kappa; - xpay(inEven, kappa2, outEven, Vh*mySpinorSiteSize); - free(tmp); } @@ -252,18 +245,22 @@ staggered_matpc(void *outEven, void **fatlink, void**longlink, void *inEven, dou if (sPrecision == QUDA_DOUBLE_PRECISION) if (gPrecision == QUDA_DOUBLE_PRECISION) { - MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, (double)kappa, dagger_bit, matpc_type); + MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, dagger_bit, matpc_type); } else{ - MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, (double)kappa, dagger_bit, matpc_type); + MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, dagger_bit, matpc_type); } else { if (gPrecision == QUDA_DOUBLE_PRECISION){ - MatPC((float*)outEven, (double**)fatlink, (double**)longlink, (float*)inEven, (float)kappa, dagger_bit, matpc_type); + MatPC((float*)outEven, (double**)fatlink, (double**)longlink, (float*)inEven, dagger_bit, matpc_type); }else{ - MatPC((float*)outEven, (float**)fatlink, (float**)longlink, (float*)inEven, (float)kappa, dagger_bit, matpc_type); + MatPC((float*)outEven, (float**)fatlink, (float**)longlink, (float*)inEven, dagger_bit, matpc_type); } } + + // lastly apply the kappa term + double kappa2 = -kappa*kappa; + xpay(inEven, kappa2, outEven, Vh*mySpinorSiteSize, sPrecision); } #ifdef MULTI_GPU diff --git a/tests/test_util.cpp b/tests/test_util.cpp index 38c5e3cda5..44982bef69 100644 --- a/tests/test_util.cpp +++ b/tests/test_util.cpp @@ -1590,6 +1590,8 @@ bool verify_results = true; double mass = 0.1; double mu = 0.1; double anisotropy = 1.0; +double clover_coeff = 0.1; +bool compute_clover = false; double tol = 1e-7; double tol_hq = 0.; QudaTwistFlavorType twist_flavor = QUDA_TWIST_MINUS; @@ -1656,6 +1658,8 @@ void usage(char** argv ) printf(" --multishift # Whether to do a multi-shift solver test or not (default false)\n"); printf(" --mass # Mass of Dirac operator (default 0.1)\n"); printf(" --mu # Twisted-Mass of Dirac operator (default 0.1)\n"); + printf(" --compute-clover # Compute the clover field or use random numbers (default false)\n"); + printf(" --clover-coeff # Clover coefficient (default 1.0)\n"); printf(" --anisotropy # Temporal anisotropy factor (default 1.0)\n"); printf(" --mass-normalization # Mass normalization (kappa (default) / mass / asym-mass)\n"); printf(" --matpc # Matrix preconditioning type (even-even, odd-odd, even-even-asym, odd-odd-asym) \n"); @@ -2150,6 +2154,34 @@ int process_command_line_option(int argc, char** argv, int* idx) goto out; } + if( strcmp(argv[i], "--compute-clover") == 0){ + if (i+1 >= argc){ + usage(argv); + } + if (strcmp(argv[i+1], "true") == 0){ + compute_clover = true; + }else if (strcmp(argv[i+1], "false") == 0){ + compute_clover = false; + }else{ + fprintf(stderr, "ERROR: invalid compute_clover type\n"); + exit(1); + } + + i++; + ret = 0; + goto out; + } + + if( strcmp(argv[i], "--clover-coeff") == 0){ + if (i+1 >= argc){ + usage(argv); + } + clover_coeff = atof(argv[i+1]); + i++; + ret = 0; + goto out; + } + if( strcmp(argv[i], "--mu") == 0){ if (i+1 >= argc){ usage(argv); diff --git a/tests/wilson_dslash_reference.cpp b/tests/wilson_dslash_reference.cpp index 752411a7b5..8d9c3baa87 100644 --- a/tests/wilson_dslash_reference.cpp +++ b/tests/wilson_dslash_reference.cpp @@ -4,6 +4,7 @@ #include + #include #include #include @@ -303,8 +304,7 @@ void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, Qu wil_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param); // lastly apply the kappa term - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V*spinorSiteSize); - else xpay((float*)in, -(float)kappa, (float*)out, V*spinorSiteSize); + xpay(in, -kappa, out, V*spinorSiteSize, precision); } void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, @@ -324,8 +324,7 @@ void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, twist_gamma5(tmp, in, dagger_bit, kappa, mu, flavor, V, QUDA_TWIST_GAMMA5_DIRECT, precision); // combine - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, -kappa, (double*)out, V*spinorSiteSize); - else xpay((float*)tmp, -(float)kappa, (float*)out, V*spinorSiteSize); + xpay(tmp, -kappa, (double*)out, V*spinorSiteSize, precision); free(tmp); } @@ -349,8 +348,7 @@ void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, // lastly apply the kappa term double kappa2 = -kappa*kappa; - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize); - else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); + xpay(inEven, kappa2, outEven, Vh*spinorSiteSize, precision); free(tmp); } @@ -401,11 +399,9 @@ void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu // lastly apply the kappa term double kappa2 = -kappa*kappa; if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD) { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize); - else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); + xpay(inEven, kappa2, outEven, Vh*spinorSiteSize, precision); } else { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, kappa2, (double*)outEven, Vh*spinorSiteSize); - else xpay((float*)tmp, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); + xpay(tmp, kappa2, outEven, Vh*spinorSiteSize, precision); } free(tmp); @@ -419,7 +415,7 @@ void ndegTwistGamma5(sFloat *out1, sFloat *out2, sFloat *in1, sFloat *in2, const sFloat a=0.0, b=0.0, d=0.0; if (twist == QUDA_TWIST_GAMMA5_DIRECT) { // applying the twist - a = 2.0 * kappa * mu; + a = 2.0 * kappa * mu; b = -2.0 * kappa * epsilon; d = 1.0; } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) { // applying the inverse twist @@ -485,66 +481,60 @@ void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, if (!daggerBit) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) { - wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param); - wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param); + wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param); + wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); ndeg_twist_gamma5(outEven1, outEven2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { - wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param); - wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param); + wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param); + wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); ndeg_twist_gamma5(outEven1, outEven2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); } } else { if (matpc_type == QUDA_MATPC_EVEN_EVEN) { ndeg_twist_gamma5(tmp1, tmp2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { ndeg_twist_gamma5(tmp1, tmp2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); } } if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { - wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param); - wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param); + wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param); + wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); } else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { - wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param); - wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param); + wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param); + wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); } // lastly apply the kappa term double kappa2 = -kappa*kappa; if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { - ndeg_twist_gamma5(inEven1, inEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); + ndeg_twist_gamma5(inEven1, inEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); } - if (precision == QUDA_DOUBLE_PRECISION){ - xpay((double*)inEven1, kappa2, (double*)outEven1, Vh*spinorSiteSize); - xpay((double*)inEven2, kappa2, (double*)outEven2, Vh*spinorSiteSize); - } - else{ - xpay((float*)inEven1, (float)kappa2, (float*)outEven1, Vh*spinorSiteSize); - xpay((float*)inEven2, (float)kappa2, (float*)outEven2, Vh*spinorSiteSize); - } + xpay(inEven1, kappa2, outEven1, Vh*spinorSiteSize, precision); + xpay(inEven2, kappa2, outEven2, Vh*spinorSiteSize, precision); free(tmp1); free(tmp2); @@ -555,48 +545,39 @@ void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void * { //V-4d volume and Vh=V/2 void *inEven1 = evenIn; - void *inEven2 = (char*)evenIn + precision*Vh*spinorSiteSize; + void *inEven2 = (char*)evenIn + precision*Vh*spinorSiteSize; void *inOdd1 = oddIn; - void *inOdd2 = (char*)oddIn + precision*Vh*spinorSiteSize; + void *inOdd2 = (char*)oddIn + precision*Vh*spinorSiteSize; void *outEven1 = evenOut; - void *outEven2 = (char*)evenOut + precision*Vh*spinorSiteSize; + void *outEven2 = (char*)evenOut + precision*Vh*spinorSiteSize; void *outOdd1 = oddOut; - void *outOdd2 = (char*)oddOut + precision*Vh*spinorSiteSize; + void *outOdd2 = (char*)oddOut + precision*Vh*spinorSiteSize; void *tmpEven1 = malloc(Vh*spinorSiteSize*precision); void *tmpEven2 = malloc(Vh*spinorSiteSize*precision); void *tmpOdd1 = malloc(Vh*spinorSiteSize*precision); - void *tmpOdd2 = malloc(Vh*spinorSiteSize*precision); + void *tmpOdd2 = malloc(Vh*spinorSiteSize*precision); // full dslash operator: - wil_dslash(outOdd1, gauge, inEven1, 1, daggerBit, precision, gauge_param); - wil_dslash(outOdd2, gauge, inEven2, 1, daggerBit, precision, gauge_param); + wil_dslash(outOdd1, gauge, inEven1, 1, daggerBit, precision, gauge_param); + wil_dslash(outOdd2, gauge, inEven2, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven1, gauge, inOdd1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, inOdd2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, inOdd1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, inOdd2, 0, daggerBit, precision, gauge_param); // apply the twist term - ndeg_twist_gamma5(tmpEven1, tmpEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); + ndeg_twist_gamma5(tmpEven1, tmpEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); ndeg_twist_gamma5(tmpOdd1, tmpOdd2, inOdd1, inOdd2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); // combine - if (precision == QUDA_DOUBLE_PRECISION){ - xpay((double*)tmpOdd1, -kappa, (double*)outOdd1, Vh*spinorSiteSize); - xpay((double*)tmpOdd2, -kappa, (double*)outOdd2, Vh*spinorSiteSize); - - xpay((double*)tmpEven1, -kappa, (double*)outEven1, Vh*spinorSiteSize); - xpay((double*)tmpEven2, -kappa, (double*)outEven2, Vh*spinorSiteSize); - } - else{ - xpay((float*)tmpOdd1, (float)(-kappa), (float*)outOdd1, Vh*spinorSiteSize); - xpay((float*)tmpOdd2, (float)(-kappa), (float*)outOdd2, Vh*spinorSiteSize); - - xpay((float*)tmpEven1, (float)(-kappa), (float*)outEven1, Vh*spinorSiteSize); - xpay((float*)tmpEven2, (float)(-kappa), (float*)outEven2, Vh*spinorSiteSize); - } + xpay(tmpOdd1, -kappa, outOdd1, Vh*spinorSiteSize, precision); + xpay(tmpOdd2, -kappa, outOdd2, Vh*spinorSiteSize, precision); + + xpay(tmpEven1, -kappa, outEven1, Vh*spinorSiteSize, precision); + xpay(tmpEven2, -kappa, outEven2, Vh*spinorSiteSize, precision); free(tmpOdd1); free(tmpOdd2); diff --git a/tests/wilson_dslash_reference.h b/tests/wilson_dslash_reference.h index e066613f05..0dc929a299 100644 --- a/tests/wilson_dslash_reference.h +++ b/tests/wilson_dslash_reference.h @@ -10,7 +10,7 @@ extern "C" { void wil_dslash(void *res, void **gauge, void *spinorField, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); - + void wil_mat(void *out, void **gauge, void *in, double kappa, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); @@ -20,22 +20,43 @@ extern "C" { void tm_dslash(void *res, void **gauge, void *spinorField, double kappa, double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type, int daggerBit, QudaPrecision sprecision, QudaGaugeParam ¶m); - + void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); void tm_matpc(void *out, void **gauge, void *in, double kappa, double mu, - QudaTwistFlavorType flavor, QudaMatPCType matpc_type, + QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); - void tm_ndeg_dslash(void *res1, void *res2, void **gaugeFull, void *spinorField1, void *spinorField2, - double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, - QudaPrecision precision, QudaGaugeParam &gauge_param) ; + void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, + double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type, + int daggerBit, QudaPrecision sprecision, QudaGaugeParam ¶m); + + void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, + QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, + QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void tm_ndeg_dslash(void *res1, void *res2, void **gaugeFull, void *spinorField1, void *spinorField2, + double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, + QudaPrecision precision, QudaGaugeParam &gauge_param); void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, void *inEven2, double kappa, double mu, double epsilon, - QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param); - - void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void *oddIn, - double kappa, double mu, double epsilon, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param); + QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void *oddIn, + double kappa, double mu, double epsilon, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void apply_clover(void *out, void *clover, void *in, int parity, QudaPrecision precision); + + void clover_dslash(void *res, void **gauge, void *clover, void *spinorField, int oddBit, + int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); + + void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, + int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, + QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param); #ifdef __cplusplus }