Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/host clover #480

Merged
merged 24 commits into from
Jun 24, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
153d44a
Implemented reference clover application operator and enabled true Wi…
maddyscientist Jun 4, 2016
af6d23a
Fixed cmake bug in dslash_test and invert_test compilation
maddyscientist Jun 4, 2016
b6de874
Fixed bugs in clover application: dslash_test now supports full clove…
maddyscientist Jun 10, 2016
6fd83bb
Fixed staggered reference dslash compiliation
maddyscientist Jun 10, 2016
da0baba
Set off-diagonal clover terms to be non-zero in clover invert test to…
maddyscientist Jun 10, 2016
a174af0
Added host routines for twisted-clover
AlexVaq Jun 12, 2016
9b1795e
Fixed bug in tuning of clover inversion if source destination are the…
maddyscientist Jun 15, 2016
5f028db
clover::FloatNOrder now uses vectorized load/store for improved perfo…
maddyscientist Jun 15, 2016
2a6a941
Updated clover inversion preTune/postTune for direct pointer access i…
maddyscientist Jun 15, 2016
ae3e7ac
Set clover norm to 0.1, as 0.2 leads to poorly conditioned clover mat…
maddyscientist Jun 15, 2016
017a67b
Clover inversion now uses grid-wide reduction rather than atomics to …
maddyscientist Jun 16, 2016
cab5e73
Display last error when autotuning fails
maddyscientist Jun 16, 2016
a671b9c
Fixed warning in last commit
maddyscientist Jun 16, 2016
9da9306
Merged separate clover and clover inverse fields for twisted-clover f…
maddyscientist Jun 17, 2016
3a8139c
Added default and copy constructors for CloverFieldParam
maddyscientist Jun 18, 2016
b1d088a
Significant cleanup of loadCloverQuda to minimize redundant code. Ad…
maddyscientist Jun 18, 2016
0a64e6d
In dslash_test, clover now matches twisted-clover in that the clover …
maddyscientist Jun 18, 2016
8ae1373
Added new parameters QudaInvertParam::compute_clover and QudaInvertPa…
maddyscientist Jun 21, 2016
d2207e2
Added new command-line parameters to unit tests for enabling QUDA-sid…
maddyscientist Jun 21, 2016
807a2c0
invert_test now supports --clover-coeff and --compute-clover command …
maddyscientist Jun 21, 2016
86034f9
Added missing --clover-coeff support to dslash_test
maddyscientist Jun 21, 2016
d011bf2
Added support for new clover testing in multigrid_invert_test
maddyscientist Jun 21, 2016
38fda50
Added chiral-block-level accessor for clover::FloatNOrder to try to g…
maddyscientist Jun 21, 2016
8899390
Added missing guard around clover construction
maddyscientist Jun 23, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/clover_field.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@ namespace quda {
order = (precision == QUDA_DOUBLE_PRECISION) ?
QUDA_FLOAT2_CLOVER_ORDER : QUDA_FLOAT4_CLOVER_ORDER;
}

CloverFieldParam() : LatticeFieldParam(),
direct(true), inverse(true), clover(nullptr), norm(nullptr),
cloverInv(nullptr), invNorm(nullptr), twisted(false), mu2(0.0) { }

CloverFieldParam(const CloverFieldParam &param) : LatticeFieldParam(param),
direct(param.direct), inverse(param.inverse),
clover(param.clover), norm(param.norm),
cloverInv(param.cloverInv), invNorm(param.invNorm),
twisted(param.twisted), mu2(param.mu2) { }
};

std::ostream& operator<<(std::ostream& output, const CloverFieldParam& param);
Expand Down
165 changes: 122 additions & 43 deletions include/clover_field_order.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ namespace quda {
complex<Float> tmp(a[parity][idx], a[parity][idx+1]);
return tmp;
} else {
// requesting upper triangular so return conjuate transpose
// requesting upper triangular so return conjugate transpose
return conj(operator()(parity,x,s_col,s_row,c_col,c_row) );
}
}
Expand Down Expand Up @@ -252,65 +252,143 @@ namespace quda {
template <typename Float, int length, int N>
struct FloatNOrder {
typedef typename mapper<Float>::type RegType;
Float *clover[2];
float *norm[2];
typedef typename VectorType<Float,N>::type Vector;
static const int M=length/(N*2); // number of short vectors per chiral block
static const int block=length/2; // chiral block size

Float *clover;
float *norm;
size_t offset;
size_t norm_offset;
const int volumeCB;
const int stride;

const bool twisted;
const Float mu2;

FloatNOrder(const CloverField &clover, bool inverse, Float *clover_=0, float *norm_=0) : volumeCB(clover.VolumeCB()), stride(clover.Stride()),
twisted(clover.Twisted()), mu2(clover.Mu2()) {
this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse));
this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2);
this->norm[0] = norm_ ? norm_ : (float*)(clover.Norm(inverse));
this->norm[1] = (float*)((char*)this->norm[0] + clover.NormBytes()/2);
}

size_t bytes;
size_t norm_bytes;
void *backup_h; //! host memory for backing up the field when tuning
void *backup_norm_h; //! host memory for backing up norm when tuning

FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, float *norm_=0) :
offset(clover.Bytes()/(2*sizeof(Float))), norm_offset(clover.NormBytes()/(2*sizeof(float))),
volumeCB(clover.VolumeCB()), stride(clover.Stride()),
twisted(clover.Twisted()), mu2(clover.Mu2()), bytes(clover.Bytes()),
norm_bytes(clover.NormBytes()), backup_h(nullptr), backup_norm_h(nullptr)
{
this->clover = clover_ ? clover_ : (Float*)(clover.V(is_inverse));
this->norm = norm_ ? norm_ : (float*)(clover.Norm(is_inverse));
}

bool Twisted() const {return twisted;}
Float Mu2() const {return mu2;}

__device__ __host__ inline void load(RegType v[length], int x, int parity) const {
const int M=length/(N*2);
for (int chirality=0; chirality<2; chirality++) {
for (int i=0; i<M; i++) {
for (int j=0; j<N; j++) {
int intIdx = (chirality*M + i)*N + j; // internal dof index
int padIdx = intIdx / N;
copy(v[(chirality*M+i)*N+j], clover[parity][(padIdx*stride + x)*N + intIdx%N]);
if (sizeof(Float)==sizeof(short)) v[(chirality*M+i)*N+j] *= norm[parity][chirality*stride + x];
}
}
/**
@brief Load accessor for a single chiral block
@param[out] v Vector of loaded elements
@param[in] x Checkerboarded site index
@param[in] parity Field parity
@param[in] chirality Chiral block index
*/
__device__ __host__ inline void load(RegType v[block], int x, int parity, int chirality) const {
#pragma unroll
for (int i=0; i<M; i++) {
// first do vectorized copy from memory
Vector vecTmp = vector_load<Vector>(clover + parity*offset, x + stride*(chirality*M+i));
// second do scalar copy converting into register type
#pragma unroll
for (int j=0; j<N; j++) copy(v[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
}

if (sizeof(Float)==sizeof(short))
#pragma unroll
for (int i=0; i<block; i++) v[i] *= norm[parity*norm_offset + chirality*stride + x];
}

__device__ __host__ inline void save(const RegType v[length], int x, int parity) {
/**
@brief Store accessor for a single chiral block
@param[out] v Vector of elements to be stored
@param[in] x Checkerboarded site index
@param[in] parity Field parity
@param[in] chirality Chiral block index
*/
__device__ __host__ inline void save(const RegType v[block], int x, int parity, int chirality) {
// find the norm of each chiral block
RegType scale[2];
RegType scale = 0.0;
if (sizeof(Float)==sizeof(short)) {
const int M = length/2;
for (int chi=0; chi<2; chi++) { // chirality
scale[chi] = 0.0;
for (int i=0; i<M; i++)
scale[chi] = fabs(v[chi*M+i]) > scale[chi] ? fabs(v[chi*M+i]) : scale[chi];
norm[parity][chi*stride + x] = scale[chi];
}
#pragma unroll
for (int i=0; i<block; i++) scale = fabs(v[i]) > scale ? fabs(v[i]) : scale;
norm[parity*norm_offset + chirality*stride + x] = scale;
}

const int M=length/(N*2);
for (int chirality=0; chirality<2; chirality++) {
for (int i=0; i<M; i++) {
for (int j=0; j<N; j++) {
int intIdx = (chirality*M + i)*N + j;
int padIdx = intIdx / N;
if (sizeof(Float)==sizeof(short))
copy(clover[parity][(padIdx*stride + x)*N + intIdx%N], v[(chirality*M+i)*N+j] / scale[chirality]);
else
copy(clover[parity][(padIdx*stride + x)*N + intIdx%N], v[(chirality*M+i)*N+j]);
}
}
#pragma unroll
for (int i=0; i<M; i++) {
Vector vecTmp;
// first do scalar copy converting into storage type and rescaling if necessary
if (sizeof(Float)==sizeof(short))
#pragma unroll
for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[i*N+j] / scale);
else
#pragma unroll
for (int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[i*N+j]);

// second do vectorized copy into memory
reinterpret_cast<Vector*>(clover + parity*offset)[x + stride*(chirality*M+i)] = vecTmp;
}
}

/**
@brief Load accessor for the clover matrix
@param[out] v Vector of loaded elements
@param[in] x Checkerboarded site index
@param[in] parity Field parity
@param[in] chirality Chiral block index
*/
__device__ __host__ inline void load(RegType v[length], int x, int parity) const {
#pragma unroll
for (int chirality=0; chirality<2; chirality++) load(&v[chirality*36], x, parity, chirality);
}

/**
@brief Store accessor for the clover matrix
@param[out] v Vector of elements to be stored
@param[in] x Checkerboarded site index
@param[in] parity Field parity
@param[in] chirality Chiral block index
*/
__device__ __host__ inline void save(const RegType v[length], int x, int parity) {
#pragma unroll
for (int chirality=0; chirality<2; chirality++) save(&v[chirality*36], x, parity, chirality);
}

/**
@brief Backup the field to the host when tuning
*/
void save() {
if (backup_h) errorQuda("Already allocated host backup");
backup_h = safe_malloc(bytes);
cudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost);
if (norm_bytes) {
backup_norm_h = safe_malloc(norm_bytes);
cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
}
checkCudaError();
}

/**
@brief Restore the field from the host after tuning
*/
void load() {
cudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice);
host_free(backup_h);
backup_h = nullptr;
if (norm_bytes) {
cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice);
host_free(backup_norm_h);
backup_norm_h = nullptr;
}
checkCudaError();
}

size_t Bytes() const {
Expand All @@ -320,6 +398,7 @@ namespace quda {
}
};


/**
QDP ordering for clover fields
*/
Expand Down
4 changes: 1 addition & 3 deletions include/dirac_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ namespace quda {
cudaGaugeField *fatGauge; // used by staggered only
cudaGaugeField *longGauge; // used by staggered only
cudaCloverField *clover;
cudaCloverField *cloverInv;

double mu; // used by twisted mass only
double epsilon; //2nd tm parameter (used by twisted mass only)
Expand All @@ -49,7 +48,7 @@ namespace quda {

DiracParam()
: type(QUDA_INVALID_DIRAC), kappa(0.0), m5(0.0), matpcType(QUDA_MATPC_INVALID),
dagger(QUDA_DAG_INVALID), gauge(0), clover(0), cloverInv(0), mu(0.0), epsilon(0.0),
dagger(QUDA_DAG_INVALID), gauge(0), clover(0), mu(0.0), epsilon(0.0),
tmp1(0), tmp2(0)
{

Expand Down Expand Up @@ -537,7 +536,6 @@ namespace quda {
double mu;
double epsilon;
cudaCloverField &clover;
cudaCloverField &cloverInv;
void checkParitySpinor(const ColorSpinorField &, const ColorSpinorField &) const;
void twistedCloverApply(ColorSpinorField &out, const ColorSpinorField &in,
const QudaTwistGamma5Type twistType, const int parity) const;
Expand Down
3 changes: 1 addition & 2 deletions include/multigrid.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,6 @@ namespace quda {
@param T[in] Transfer operator that defines the coarse space
@param gauge[in] Gauge field from fine grid
@param clover[in] Clover field on fine grid (optional)
@param cloverInv[in] Inverse Clover field on fine grid (optional, only for twisted-clover)
@param kappa[in] Kappa parameter
@param mu[in] Mu parameter (set to non-zero for twisted-mass/twisted-clover)
@param matpc[in] The type of even-odd preconditioned fine-grid
Expand All @@ -338,7 +337,7 @@ namespace quda {
even-odd preconditioned and we coarsen the full operator.
*/
void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T,
const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv,
const cudaGaugeField &gauge, const cudaCloverField *clover,
double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc);

/**
Expand Down
5 changes: 5 additions & 0 deletions include/quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ extern "C" {
int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */
double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */

int compute_clover; /**< Whether to compute the clover field */
int compute_clover_inverse; /**< Whether to compute the clover inverse field */
int return_clover; /**< Whether to copy back the clover matrix field */
int return_clover_inverse; /**< Whether to copy back the inverted clover matrix field */

QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */

int sp_pad; /**< The padding to use for the fermion fields */
Expand Down
13 changes: 11 additions & 2 deletions lib/check_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,18 @@ void printQudaInvertParam(QudaInvertParam *param) {
#if defined INIT_PARAM
P(clover_cuda_prec_precondition, QUDA_INVALID_PRECISION);
P(compute_clover_trlog, 0);
P(compute_clover, 0);
P(compute_clover_inverse, 0);
P(return_clover, 0);
P(return_clover_inverse, 0);
#else
if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION)
param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy;
if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION)
param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy;
P(compute_clover_trlog, QUDA_INVALID_PRECISION);
P(compute_clover, QUDA_INVALID_PRECISION);
P(compute_clover_inverse, QUDA_INVALID_PRECISION);
P(return_clover, QUDA_INVALID_PRECISION);
P(return_clover_inverse, QUDA_INVALID_PRECISION);
#endif
P(clover_order, QUDA_INVALID_CLOVER_ORDER);
P(cl_pad, INVALID_INT);
Expand Down
Loading