Skip to content

Commit

Permalink
Merge pull request #480 from lattice/feature/host-clover
Browse files Browse the repository at this point in the history
  • Loading branch information
mathiaswagner authored Jun 24, 2016
2 parents bc696cf + 8899390 commit f96193d
Show file tree
Hide file tree
Showing 29 changed files with 1,130 additions and 798 deletions.
10 changes: 10 additions & 0 deletions include/clover_field.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@ namespace quda {
order = (precision == QUDA_DOUBLE_PRECISION) ?
QUDA_FLOAT2_CLOVER_ORDER : QUDA_FLOAT4_CLOVER_ORDER;
}

CloverFieldParam() : LatticeFieldParam(),
direct(true), inverse(true), clover(nullptr), norm(nullptr),
cloverInv(nullptr), invNorm(nullptr), twisted(false), mu2(0.0) { }

CloverFieldParam(const CloverFieldParam &param) : LatticeFieldParam(param),
direct(param.direct), inverse(param.inverse),
clover(param.clover), norm(param.norm),
cloverInv(param.cloverInv), invNorm(param.invNorm),
twisted(param.twisted), mu2(param.mu2) { }
};

std::ostream& operator<<(std::ostream& output, const CloverFieldParam& param);
Expand Down
165 changes: 122 additions & 43 deletions include/clover_field_order.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ namespace quda {
complex<Float> tmp(a[parity][idx], a[parity][idx+1]);
return tmp;
} else {
// requesting upper triangular so return conjuate transpose
// requesting upper triangular so return conjugate transpose
return conj(operator()(parity,x,s_col,s_row,c_col,c_row) );
}
}
Expand Down Expand Up @@ -252,65 +252,143 @@ namespace quda {
template <typename Float, int length, int N>
struct FloatNOrder {
typedef typename mapper<Float>::type RegType;
Float *clover[2];
float *norm[2];
typedef typename VectorType<Float,N>::type Vector;
static const int M=length/(N*2); // number of short vectors per chiral block
static const int block=length/2; // chiral block size

Float *clover;
float *norm;
size_t offset;
size_t norm_offset;
const int volumeCB;
const int stride;

const bool twisted;
const Float mu2;

FloatNOrder(const CloverField &clover, bool inverse, Float *clover_=0, float *norm_=0) : volumeCB(clover.VolumeCB()), stride(clover.Stride()),
twisted(clover.Twisted()), mu2(clover.Mu2()) {
this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse));
this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2);
this->norm[0] = norm_ ? norm_ : (float*)(clover.Norm(inverse));
this->norm[1] = (float*)((char*)this->norm[0] + clover.NormBytes()/2);
}

size_t bytes;
size_t norm_bytes;
void *backup_h; //! host memory for backing up the field when tuning
void *backup_norm_h; //! host memory for backing up norm when tuning

FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, float *norm_=0) :
offset(clover.Bytes()/(2*sizeof(Float))), norm_offset(clover.NormBytes()/(2*sizeof(float))),
volumeCB(clover.VolumeCB()), stride(clover.Stride()),
twisted(clover.Twisted()), mu2(clover.Mu2()), bytes(clover.Bytes()),
norm_bytes(clover.NormBytes()), backup_h(nullptr), backup_norm_h(nullptr)
{
this->clover = clover_ ? clover_ : (Float*)(clover.V(is_inverse));
this->norm = norm_ ? norm_ : (float*)(clover.Norm(is_inverse));
}

bool Twisted() const {return twisted;}
Float Mu2() const {return mu2;}

__device__ __host__ inline void load(RegType v[length], int x, int parity) const {
const int M=length/(N*2);
for (int chirality=0; chirality<2; chirality++) {
for (int i=0; i<M; i++) {
for (int j=0; j<N; j++) {
int intIdx = (chirality*M + i)*N + j; // internal dof index
int padIdx = intIdx / N;
copy(v[(chirality*M+i)*N+j], clover[parity][(padIdx*stride + x)*N + intIdx%N]);
if (sizeof(Float)==sizeof(short)) v[(chirality*M+i)*N+j] *= norm[parity][chirality*stride + x];
}
}
/**
@brief Load accessor for a single chiral block
@param[out] v Vector of loaded elements
@param[in] x Checkerboarded site index
@param[in] parity Field parity
@param[in] chirality Chiral block index
*/
__device__ __host__ inline void load(RegType v[block], int x, int parity, int chirality) const {
#pragma unroll
for (int i=0; i<M; i++) {
// first do vectorized copy from memory
Vector vecTmp = vector_load<Vector>(clover + parity*offset, x + stride*(chirality*M+i));
// second do scalar copy converting into register type
#pragma unroll
for (int j=0; j<N; j++) copy(v[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
}

if (sizeof(Float)==sizeof(short))
#pragma unroll
for (int i=0; i<block; i++) v[i] *= norm[parity*norm_offset + chirality*stride + x];
}

__device__ __host__ inline void save(const RegType v[length], int x, int parity) {
/**
@brief Store accessor for a single chiral block
@param[out] v Vector of elements to be stored
@param[in] x Checkerboarded site index
@param[in] parity Field parity
@param[in] chirality Chiral block index
*/
__device__ __host__ inline void save(const RegType v[block], int x, int parity, int chirality) {
// find the norm of each chiral block
RegType scale[2];
RegType scale = 0.0;
if (sizeof(Float)==sizeof(short)) {
const int M = length/2;
for (int chi=0; chi<2; chi++) { // chirality
scale[chi] = 0.0;
for (int i=0; i<M; i++)
scale[chi] = fabs(v[chi*M+i]) > scale[chi] ? fabs(v[chi*M+i]) : scale[chi];
norm[parity][chi*stride + x] = scale[chi];
}
#pragma unroll
for (int i=0; i<block; i++) scale = fabs(v[i]) > scale ? fabs(v[i]) : scale;
norm[parity*norm_offset + chirality*stride + x] = scale;
}

const int M=length/(N*2);
for (int chirality=0; chirality<2; chirality++) {
for (int i=0; i<M; i++) {
for (int j=0; j<N; j++) {
int intIdx = (chirality*M + i)*N + j;
int padIdx = intIdx / N;
if (sizeof(Float)==sizeof(short))
copy(clover[parity][(padIdx*stride + x)*N + intIdx%N], v[(chirality*M+i)*N+j] / scale[chirality]);
else
copy(clover[parity][(padIdx*stride + x)*N + intIdx%N], v[(chirality*M+i)*N+j]);
}
}
#pragma unroll
for (int i=0; i<M; i++) {
Vector vecTmp;
// first do scalar copy converting into storage type and rescaling if necessary
if (sizeof(Float)==sizeof(short))
#pragma unroll
for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[i*N+j] / scale);
else
#pragma unroll
for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[i*N+j]);

// second do vectorized copy into memory
reinterpret_cast<Vector*>(clover + parity*offset)[x + stride*(chirality*M+i)] = vecTmp;
}
}

/**
@brief Load accessor for the clover matrix
@param[out] v Vector of loaded elements
@param[in] x Checkerboarded site index
@param[in] parity Field parity
@param[in] chirality Chiral block index
*/
__device__ __host__ inline void load(RegType v[length], int x, int parity) const {
#pragma unroll
for (int chirality=0; chirality<2; chirality++) load(&v[chirality*36], x, parity, chirality);
}

/**
@brief Store accessor for the clover matrix
@param[out] v Vector of elements to be stored
@param[in] x Checkerboarded site index
@param[in] parity Field parity
@param[in] chirality Chiral block index
*/
__device__ __host__ inline void save(const RegType v[length], int x, int parity) {
#pragma unroll
for (int chirality=0; chirality<2; chirality++) save(&v[chirality*36], x, parity, chirality);
}

/**
@brief Backup the field to the host when tuning
*/
void save() {
if (backup_h) errorQuda("Already allocated host backup");
backup_h = safe_malloc(bytes);
cudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost);
if (norm_bytes) {
backup_norm_h = safe_malloc(norm_bytes);
cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
}
checkCudaError();
}

/**
@brief Restore the field from the host after tuning
*/
void load() {
cudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice);
host_free(backup_h);
backup_h = nullptr;
if (norm_bytes) {
cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice);
host_free(backup_norm_h);
backup_norm_h = nullptr;
}
checkCudaError();
}

size_t Bytes() const {
Expand All @@ -320,6 +398,7 @@ namespace quda {
}
};


/**
QDP ordering for clover fields
*/
Expand Down
4 changes: 1 addition & 3 deletions include/dirac_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ namespace quda {
cudaGaugeField *fatGauge; // used by staggered only
cudaGaugeField *longGauge; // used by staggered only
cudaCloverField *clover;
cudaCloverField *cloverInv;

double mu; // used by twisted mass only
double epsilon; //2nd tm parameter (used by twisted mass only)
Expand All @@ -49,7 +48,7 @@ namespace quda {

DiracParam()
: type(QUDA_INVALID_DIRAC), kappa(0.0), m5(0.0), matpcType(QUDA_MATPC_INVALID),
dagger(QUDA_DAG_INVALID), gauge(0), clover(0), cloverInv(0), mu(0.0), epsilon(0.0),
dagger(QUDA_DAG_INVALID), gauge(0), clover(0), mu(0.0), epsilon(0.0),
tmp1(0), tmp2(0)
{

Expand Down Expand Up @@ -537,7 +536,6 @@ namespace quda {
double mu;
double epsilon;
cudaCloverField &clover;
cudaCloverField &cloverInv;
void checkParitySpinor(const ColorSpinorField &, const ColorSpinorField &) const;
void twistedCloverApply(ColorSpinorField &out, const ColorSpinorField &in,
const QudaTwistGamma5Type twistType, const int parity) const;
Expand Down
3 changes: 1 addition & 2 deletions include/multigrid.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,6 @@ namespace quda {
@param T[in] Transfer operator that defines the coarse space
@param gauge[in] Gauge field from fine grid
@param clover[in] Clover field on fine grid (optional)
@param cloverInv[in] Inverse Clover field on fine grid (optional, only for twisted-clover)
@param kappa[in] Kappa parameter
@param mu[in] Mu parameter (set to non-zero for twisted-mass/twisted-clover)
@param matpc[in] The type of even-odd preconditioned fine-grid
Expand All @@ -338,7 +337,7 @@ namespace quda {
even-odd preconditioned and we coarsen the full operator.
*/
void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T,
const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv,
const cudaGaugeField &gauge, const cudaCloverField *clover,
double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc);

/**
Expand Down
5 changes: 5 additions & 0 deletions include/quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ extern "C" {
int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */
double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */

int compute_clover; /**< Whether to compute the clover field */
int compute_clover_inverse; /**< Whether to compute the clover inverse field */
int return_clover; /**< Whether to copy back the clover matrix field */
int return_clover_inverse; /**< Whether to copy back the inverted clover matrix field */

QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */

int sp_pad; /**< The padding to use for the fermion fields */
Expand Down
13 changes: 11 additions & 2 deletions lib/check_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,18 @@ void printQudaInvertParam(QudaInvertParam *param) {
#if defined INIT_PARAM
P(clover_cuda_prec_precondition, QUDA_INVALID_PRECISION);
P(compute_clover_trlog, 0);
P(compute_clover, 0);
P(compute_clover_inverse, 0);
P(return_clover, 0);
P(return_clover_inverse, 0);
#else
if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION)
param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy;
if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION)
param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy;
P(compute_clover_trlog, QUDA_INVALID_PRECISION);
P(compute_clover, QUDA_INVALID_PRECISION);
P(compute_clover_inverse, QUDA_INVALID_PRECISION);
P(return_clover, QUDA_INVALID_PRECISION);
P(return_clover_inverse, QUDA_INVALID_PRECISION);
#endif
P(clover_order, QUDA_INVALID_CLOVER_ORDER);
P(cl_pad, INVALID_INT);
Expand Down
Loading

0 comments on commit f96193d

Please sign in to comment.