diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 84c88d21ef..81c23d2faf 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -444,13 +444,15 @@ namespace quda { #if __COMPUTE_CAPABILITY__ >= 200 const int hasPhase; const size_t phaseOffset; + void *backup_h; //! host memory for backing up the field when tuning + size_t bytes; #endif FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : reconstruct(u), volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()) #if __COMPUTE_CAPABILITY__ >= 200 , hasPhase((u.Reconstruct() == QUDA_RECONSTRUCT_9 || u.Reconstruct() == QUDA_RECONSTRUCT_13) ? 1 : 0), - phaseOffset(u.PhaseOffset()) + phaseOffset(u.PhaseOffset()), backup_h(0), bytes(u.Bytes()) #endif { if (gauge_) { gauge[0] = gauge_; gauge[1] = (Float*)((char*)gauge_ + u.Bytes()/2); @@ -467,7 +469,7 @@ namespace quda { : reconstruct(order.reconstruct), volumeCB(order.volumeCB), stride(order.stride), geometry(order.geometry) #if __COMPUTE_CAPABILITY__ >= 200 - , hasPhase(order.hasPhase), phaseOffset(order.phaseOffset) + , hasPhase(order.hasPhase), phaseOffset(order.phaseOffset), backup_h(0), bytes(order.bytes) #endif { gauge[0] = order.gauge[0]; @@ -618,6 +620,30 @@ namespace quda { } } + /** + used to backup the field to the host when tuning + */ + void save() { +#if __COMPUTE_CAPABILITY__ >= 200 + if (backup_h) errorQuda("Already allocated host backup"); + backup_h = safe_malloc(bytes); + cudaMemcpy(backup_h, gauge[0], bytes, cudaMemcpyDeviceToHost); + checkCudaError(); +#endif + } + + /** + restore the field from the host after tuning + */ + void load() { +#if __COMPUTE_CAPABILITY__ >= 200 + cudaMemcpy(gauge[0], backup_h, bytes, cudaMemcpyHostToDevice); + host_free(backup_h); + backup_h = 0; + checkCudaError(); +#endif + } + size_t Bytes() const { return reconLen * sizeof(Float); } }; diff --git a/include/quda.h b/include/quda.h index e55faa4bfe..d3b04ae250 100644 --- a/include/quda.h +++ b/include/quda.h @@ -635,31 +635,91 @@ extern "C" { /** - * Take a gauge field on the host, extend it and load it onto the device. - * Return a pointer to the extended gauge field. + * Take a gauge field on the host, load it onto the device and extend it. + * Return a pointer to the extended gauge field object. + * + * @param gauge The CPU gauge field (optional - if set to 0 then the gauge field zeroed) + * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor) + * @param param The parameters of the external field and the field to be created + * @return Pointer to the gauge field (cast as a void*) */ - void* createExtendedGaugeField(void* gauge, int geometry, QudaGaugeParam* param); - - void* createGaugeField(void* gauge, int geometry, QudaGaugeParam* param); + void* createExtendedGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param); - void saveGaugeField(void* outGauge, void* inGauge, QudaGaugeParam* param); + /** + * Allocate a gauge (matrix) field on the device and optionally download a host gauge field. + * + * @param gauge The host gauge field (optional - if set to 0 then the gauge field zeroed) + * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor) + * @param param The parameters of the external field and the field to be created + * @return Pointer to the gauge field (cast as a void*) + */ + void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param); - void extendGaugeField(void* outGauge, void* inGauge); + /** + * Copy the QUDA gauge (matrix) field on the device to the CPU + * + * @param outGauge Pointer to the host gauge field + * @param inGauge Pointer to the device gauge field (QUDA device field) + * @param param The parameters of the host and device fields + */ + void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); + /** + * Take a gauge field on the device and copy to the extended gauge + * field. The precisions and reconstruct types can differ between + * the input and output field, but they must be compatible (same volume, geometry). + * + * @param outGauge Pointer to the output extended device gauge field (QUDA extended device field) + * @param inGauge Pointer to the input device gauge field (QUDA gauge field) + */ + void extendGaugeFieldQuda(void* outGauge, void* inGauge); /** - * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. + * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. + * + * @param gauge Gauge field to be freed */ - void destroyQudaGaugeField(void* gauge); + void destroyGaugeFieldQuda(void* gauge); + /** + * Compute the clover field and its inverse from the resident gauge field. + * + * @param param The parameters of the clover field to create + */ void createCloverQuda(QudaInvertParam* param); - - void computeCloverTraceQuda(void* out, void* clover, int mu, int nu, int dim[4]); - + /** + * Compute the sigma trace field (part of clover force computation). + * All the pointers here are for QUDA native device objects. The + * precisions of all fields must match. This function requires that + * there is a persistent clover field. + * + * @param out Sigma trace field (QUDA device field, geometry = 1) + * @param dummy (not used) + * @param mu mu direction + * @param nu nu direction + * @param dim array of local field dimensions + */ + void computeCloverTraceQuda(void* out, void* dummy, int mu, int nu, int dim[4]); + + /** + * Compute the derivative of the clover term (part of clover force + * computation). All the pointers here are for QUDA native device + * objects. The precisions of all fields must match. + * + * @param out Clover derivative field (QUDA device field, geometry = 1) + * @param gauge Gauge field (extended QUDA device field, gemoetry = 4) + * @param oprod Matrix field (outer product) which is multiplied by the derivative + * @param mu mu direction + * @param nu nu direction + * @param coeff Coefficient of the clover derviative (including stepsize and clover coefficient) + * @param parity Parity for which we are computing + * @param param Gauge field meta data + * @param conjugate Whether to make the oprod field anti-hermitian prior to multiplication + */ void computeCloverDerivativeQuda(void* out, void* gauge, void* oprod, int mu, int nu, - double coeff, - QudaParity parity, QudaGaugeParam* param, int conjugate); + double coeff, + QudaParity parity, QudaGaugeParam* param, int conjugate); /** * Compute the quark-field outer product needed for gauge generation diff --git a/include/quda_milc_interface.h b/include/quda_milc_interface.h index 85984e0ede..89294bace1 100644 --- a/include/quda_milc_interface.h +++ b/include/quda_milc_interface.h @@ -221,27 +221,118 @@ extern "C" { void* oprod[2]); + /** + * Evolve the gauge field by step size dt, using the momentum field + * I.e., Evalulate U(t+dt) = e(dt pi) U(t). All fields are CPU fields in MILC order. + * + * @param precision Precision of the field (2 - double, 1 - single) + * @param dt The integration step size step + * @param momentum The momentum field + * @param The gauge field to be updated + */ void qudaUpdateU(int precision, - double eps, - void* momentum, - void* link); - - void qudaCloverTrace(void* out, void* clover, int mu, int nu); - - - void qudaCloverDerivative(void* out, void* gauge, void* oprod, - int mu, int nu, double coeff, int precision, int parity, int conjugate); - - - void* qudaCreateExtendedGaugeField(void* gauge, int geometry, int precision); - - void* qudaCreateGaugeField(void* gauge, int geometry, int precision); - - void qudaSaveGaugeField(void* gauge, void* inGauge); - + double eps, + void* momentum, + void* link); + + /** + * Compute the sigma trace field (part of clover force computation). + * All the pointers here are for QUDA native device objects. The + * precisions of all fields must match. This function requires that + * there is a persistent clover field. + * + * @param out Sigma trace field (QUDA device field, geometry = 1) + * @param dummy (not used) + * @param mu mu direction + * @param nu nu direction + */ + void qudaCloverTrace(void* out, + void* dummy, + int mu, + int nu); + + + /** + * Compute the derivative of the clover term (part of clover force + * computation). All the pointers here are for QUDA native device + * objects. The precisions of all fields must match. + * + * @param out Clover derivative field (QUDA device field, geometry = 1) + * @param gauge Gauge field (extended QUDA device field, gemoetry = 4) + * @param oprod Matrix field (outer product) which is multiplied by the derivative + * @param mu mu direction + * @param nu nu direction + * @param coeff Coefficient of the clover derviative (including stepsize and clover coefficient) + * @param precision Precision of the fields (2 = double, 1 = single) + * @param parity Parity for which we are computing + * @param conjugate Whether to make the oprod field anti-hermitian prior to multiplication + */ + void qudaCloverDerivative(void* out, + void* gauge, + void* oprod, + int mu, + int nu, + double coeff, + int precision, + int parity, + int conjugate); + + + /** + * Take a gauge field on the host, load it onto the device and extend it. + * Return a pointer to the extended gauge field object. + * + * @param gauge The CPU gauge field (optional - if set to 0 then the gauge field zeroed) + * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor) + * @param precision The precision of the fields (2 - double, 1 - single) + * @return Pointer to the gauge field (cast as a void*) + */ + void* qudaCreateExtendedGaugeField(void* gauge, + int geometry, + int precision); + + /** + * Take the QUDA resident gauge field and extend it. + * Return a pointer to the extended gauge field object. + * + * @param gauge The CPU gauge field (optional - if set to 0 then the gauge field zeroed) + * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor) + * @param precision The precision of the fields (2 - double, 1 - single) + * @return Pointer to the gauge field (cast as a void*) + */ + void* qudaResidentExtendedGaugeField(void* gauge, + int geometry, + int precision); + + /** + * Allocate a gauge (matrix) field on the device and optionally download a host gauge field. + * + * @param gauge The host gauge field (optional - if set to 0 then the gauge field zeroed) + * @param geometry The geometry of the matrix field to create (1 - scaler, 4 - vector, 6 - tensor) + * @param precision The precision of the field to be created (2 - double, 1 - single) + * @return Pointer to the gauge field (cast as a void*) + */ + void* qudaCreateGaugeField(void* gauge, + int geometry, + int precision); + + /** + * Copy the QUDA gauge (matrix) field on the device to the CPU + * + * @param outGauge Pointer to the host gauge field + * @param inGauge Pointer to the device gauge field (QUDA device field) + */ + void qudaSaveGaugeField(void* gauge, + void* inGauge); + + /** + * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. + * + * @param gauge Gauge field to be freed + */ void qudaDestroyGaugeField(void* gauge); - + #ifdef __cplusplus } #endif diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu index 23dff8ba6c..ba78011939 100644 --- a/lib/clover_deriv_quda.cu +++ b/lib/clover_deriv_quda.cu @@ -4,49 +4,41 @@ #include #include #include +#include #include #include namespace quda { - + #ifdef GPU_CLOVER_DIRAC - - template - struct CloverDerivArg + + template + struct CloverDerivArg + { + int X[4]; + int border[4]; + int mu; + int nu; + typename RealTypeId::Type coeff; + int parity; + int volumeCB; + + Force force; + Gauge gauge; + Oprod oprod; + + bool conjugate; + + CloverDerivArg(const Force& force, const Gauge& gauge, const Oprod& oprod, const int *X, + int mu, int nu, double coeff, int parity, bool conjugate) : + mu(mu), nu(nu), coeff(coeff), parity(parity), volumeCB(force.volumeCB), + force(force), gauge(gauge), oprod(oprod), conjugate(conjugate) { - int X[4]; - int border[4]; - int mu; - int nu; - typename RealTypeId::Type coeff; - int parity; - int volumeCB; - - Cmplx* gauge; - Cmplx* force; - Cmplx* oprod; - - int forceStride; - int gaugeStride; - int oprodStride; - - int forceOffset; - int gaugeOffset; - int oprodOffset; - - bool conjugate; - - CloverDerivArg(cudaGaugeField& force, cudaGaugeField& gauge, cudaGaugeField& oprod, int mu, int nu, double coeff, int parity, bool conjugate) : - mu(mu), nu(nu), coeff(coeff), parity(parity), volumeCB(force.VolumeCB()), - force(reinterpret_cast(force.Gauge_p())), gauge(reinterpret_cast(gauge.Gauge_p())), oprod(reinterpret_cast(oprod.Gauge_p())), - forceStride(force.Stride()), gaugeStride(gauge.Stride()), oprodStride(oprod.Stride()), - forceOffset(force.Bytes()/(2*sizeof(Cmplx))), gaugeOffset(gauge.Bytes()/(2*sizeof(Cmplx))), oprodOffset(oprod.Bytes()/(2*sizeof(Cmplx))) - { - for(int dir=0; dir<4; ++dir) X[dir] = force.X()[dir]; - //for(int dir=0; dir<4; ++dir) border[dir] = commDimPartitioned(dir) ? 2 : 0; - for(int dir=0; dir<4; ++dir) border[dir] = 2; - } - }; + for(int dir=0; dir<4; ++dir) this->X[dir] = X[dir]; + //for(int dir=0; dir<4; ++dir) border[dir] = commDimPartitioned(dir) ? 2 : 0; + for(int dir=0; dir<4; ++dir) border[dir] = 2; + } + }; __device__ void getCoords(int x[4], int cb_index, const int X[4], int parity) { @@ -54,7 +46,6 @@ namespace quda { x[2] = (cb_index/(X[1]*X[0]/2)) % X[2]; x[1] = (cb_index/(X[0]/2)) % X[1]; x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1); - return; } @@ -64,248 +55,235 @@ namespace quda { for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i]; return (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0])/2; } + + + template + __global__ void + cloverDerivativeKernel(Arg arg) + { + typedef typename RealTypeId::Type real; + int index = threadIdx.x + blockIdx.x*blockDim.x; - template - __global__ void - cloverDerivativeKernel(const CloverDerivArg arg) - { - int index = threadIdx.x + blockIdx.x*blockDim.x; - - if(index >= arg.volumeCB) return; - - - int x[4]; - int y[4]; - int otherparity = (1-arg.parity); - getCoords(x, index, arg.X, arg.parity); - getCoords(y, index, arg.X, otherparity); - int X[4]; - for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir]; - - for(int dir=0; dir<4; ++dir){ - x[dir] += arg.border[dir]; - y[dir] += arg.border[dir]; - X[dir] += 2*arg.border[dir]; - } - - Cmplx* thisGauge = arg.gauge + arg.parity*arg.gaugeOffset; - Cmplx* otherGauge = arg.gauge + (otherparity)*arg.gaugeOffset; - - Cmplx* thisOprod = arg.oprod + arg.parity*arg.oprodOffset; + if(index >= arg.volumeCB) return; - const int& mu = arg.mu; - const int& nu = arg.nu; + int x[4]; + int y[4]; + int otherparity = (1-arg.parity); + getCoords(x, index, arg.X, arg.parity); + getCoords(y, index, arg.X, otherparity); + int X[4]; + for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir]; + + for(int dir=0; dir<4; ++dir){ + x[dir] += arg.border[dir]; + y[dir] += arg.border[dir]; + X[dir] += 2*arg.border[dir]; + } - Matrix thisForce; - Matrix otherForce; + const int& mu = arg.mu; + const int& nu = arg.nu; - // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x) - { - int d[4] = {0, 0, 0, 0}; + Matrix thisForce; + Matrix otherForce; - // load U(x)_(+mu) - Matrix U1; - loadLinkVariableFromArray(thisGauge, mu, linkIndex(x, d, X), - arg.gaugeStride, &U1); + // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x) + { + int d[4] = {0, 0, 0, 0}; + // load U(x)_(+mu) + Matrix U1; + arg.gauge.load((real*)(U1.data), linkIndex(x, d, X), mu, arg.parity); - // load U(x+mu)_(+nu) - Matrix U2; - d[mu]++; - loadLinkVariableFromArray(otherGauge, nu, linkIndex(x, d, X), - arg.gaugeStride, &U2); - d[mu]--; + // load U(x+mu)_(+nu) + Matrix U2; + d[mu]++; + arg.gauge.load((real*)(U2.data), linkIndex(x, d, X), nu, otherparity); + d[mu]--; - // load U(x+nu)_(+mu) - Matrix U3; - d[nu]++; - loadLinkVariableFromArray(otherGauge, mu, linkIndex(x, d, X), - arg.gaugeStride, &U3); - d[nu]--; + // load U(x+nu)_(+mu) + Matrix U3; + d[nu]++; + arg.gauge.load((real*)(U3.data), linkIndex(x, d, X), mu, otherparity); + d[nu]--; - // load U(x)_(+nu) - Matrix U4; - loadLinkVariableFromArray(thisGauge, nu, linkIndex(x, d, X), - arg.gaugeStride, &U4); + // load U(x)_(+nu) + Matrix U4; + arg.gauge.load((real*)(U4.data), linkIndex(x, d, X), nu, arg.parity); - // load Oprod - Matrix Oprod1; - loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod1); + // load Oprod + Matrix Oprod1; + arg.oprod.load((real*)(Oprod1.data), linkIndex(x, d, X), 0, arg.parity); - if(isConjugate) Oprod1 -= conj(Oprod1); - thisForce = U1*U2*conj(U3)*conj(U4)*Oprod1; + if(isConjugate) Oprod1 -= conj(Oprod1); + thisForce = U1*U2*conj(U3)*conj(U4)*Oprod1; - Matrix Oprod2; - d[mu]++; d[nu]++; - loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod2); - d[mu]--; d[nu]--; + Matrix Oprod2; + d[mu]++; d[nu]++; + arg.oprod.load((real*)(Oprod2.data), linkIndex(x, d, X), 0, arg.parity); + d[mu]--; d[nu]--; - if(isConjugate) Oprod2 -= conj(Oprod2); + if(isConjugate) Oprod2 -= conj(Oprod2); - thisForce += U1*U2*Oprod2*conj(U3)*conj(U4); - - } + thisForce += U1*U2*Oprod2*conj(U3)*conj(U4); + } - { - int d[4] = {0, 0, 0, 0}; - // load U(x)_(+mu) - Matrix U1; - loadLinkVariableFromArray(otherGauge, mu, linkIndex(y, d, X), - arg.gaugeStride, &U1); - - // load U(x+mu)_(+nu) - Matrix U2; - d[mu]++; - loadLinkVariableFromArray(thisGauge, nu, linkIndex(y, d, X), - arg.gaugeStride, &U2); - d[mu]--; - - // load U(x+nu)_(+mu) - Matrix U3; - d[nu]++; - loadLinkVariableFromArray(thisGauge, mu, linkIndex(y, d, X), - arg.gaugeStride, &U3); - d[nu]--; - - // load U(x)_(+nu) - Matrix U4; - loadLinkVariableFromArray(otherGauge, nu, linkIndex(y, d, X), - arg.gaugeStride, &U4); - - // load opposite parity Oprod - Matrix Oprod3; - d[nu]++; - loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod3); - d[nu]--; - - if(isConjugate) Oprod3 -= conj(Oprod3); - otherForce = U1*U2*conj(U3)*Oprod3*conj(U4); - - // load Oprod(x+mu) - Matrix Oprod4; - d[mu]++; - loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod4); - d[mu]--; - - if(isConjugate) Oprod4 -= conj(Oprod4); - - otherForce += U1*Oprod4*U2*conj(U3)*conj(U4); - } + { + int d[4] = {0, 0, 0, 0}; + // load U(x)_(+mu) + Matrix U1; + arg.gauge.load((real*)(U1.data), linkIndex(y, d, X), mu, otherparity); + + // load U(x+mu)_(+nu) + Matrix U2; + d[mu]++; + arg.gauge.load((real*)(U2.data), linkIndex(y, d, X), nu, arg.parity); + d[mu]--; + + // load U(x+nu)_(+mu) + Matrix U3; + d[nu]++; + arg.gauge.load((real*)(U3.data), linkIndex(y, d, X), mu, arg.parity); + d[nu]--; + + // load U(x)_(+nu) + Matrix U4; + arg.gauge.load((real*)(U4.data), linkIndex(y, d, X), nu, otherparity); + + // load opposite parity Oprod + Matrix Oprod3; + d[nu]++; + arg.oprod.load((real*)(Oprod3.data), linkIndex(y, d, X), 0, arg.parity); + d[nu]--; + + if(isConjugate) Oprod3 -= conj(Oprod3); + otherForce = U1*U2*conj(U3)*Oprod3*conj(U4); + + // load Oprod(x+mu) + Matrix Oprod4; + d[mu]++; + arg.oprod.load((real*)(Oprod4.data), linkIndex(y, d, X), 0, arg.parity); + d[mu]--; + + if(isConjugate) Oprod4 -= conj(Oprod4); + + otherForce += U1*Oprod4*U2*conj(U3)*conj(U4); + } - // Lower leaf - // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x) - { - int d[4] = {0, 0, 0, 0}; - // load U(x-nu)(+nu) - Matrix U1; - d[nu]--; - loadLinkVariableFromArray(thisGauge, nu, linkIndex(y, d, X), - arg.gaugeStride, &U1); - d[nu]++; - - // load U(x-nu)(+mu) - Matrix U2; - d[nu]--; - loadLinkVariableFromArray(thisGauge, mu, linkIndex(y, d, X), - arg.gaugeStride, &U2); - d[nu]++; - - // load U(x+mu-nu)(nu) - Matrix U3; - d[mu]++; d[nu]--; - loadLinkVariableFromArray(otherGauge, nu, linkIndex(y, d, X), - arg.gaugeStride, &U3); - d[mu]--; d[nu]++; - - // load U(x)_(+mu) - Matrix U4; - loadLinkVariableFromArray(otherGauge, mu, linkIndex(y, d, X), - arg.gaugeStride, &U4); - - // load Oprod(x+mu) - Matrix Oprod1; - d[mu]++; - loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod1); - d[mu]--; - - if(isConjugate) Oprod1 -= conj(Oprod1); - - otherForce -= conj(U1)*U2*U3*Oprod1*conj(U4); - - Matrix Oprod2; - d[nu]--; - loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod2); - d[nu]++; - - if(isConjugate) Oprod2 -= conj(Oprod2); - otherForce -= conj(U1)*Oprod2*U2*U3*conj(U4); - } + // Lower leaf + // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x) + { + int d[4] = {0, 0, 0, 0}; + // load U(x-nu)(+nu) + Matrix U1; + d[nu]--; + arg.gauge.load((real*)(U1.data), linkIndex(y, d, X), nu, arg.parity); + d[nu]++; + + // load U(x-nu)(+mu) + Matrix U2; + d[nu]--; + arg.gauge.load((real*)(U2.data), linkIndex(y, d, X), mu, arg.parity); + d[nu]++; + + // load U(x+mu-nu)(nu) + Matrix U3; + d[mu]++; d[nu]--; + arg.gauge.load((real*)(U3.data), linkIndex(y, d, X), nu, otherparity); + d[mu]--; d[nu]++; + + // load U(x)_(+mu) + Matrix U4; + arg.gauge.load((real*)(U4.data), linkIndex(y, d, X), mu, otherparity); + + // load Oprod(x+mu) + Matrix Oprod1; + d[mu]++; + arg.oprod.load((real*)(Oprod1.data), linkIndex(y, d, X), 0, arg.parity); + d[mu]--; + + if(isConjugate) Oprod1 -= conj(Oprod1); + + otherForce -= conj(U1)*U2*U3*Oprod1*conj(U4); + + Matrix Oprod2; + d[nu]--; + arg.oprod.load((real*)(Oprod2.data), linkIndex(y, d, X), 0, arg.parity); + d[nu]++; + + if(isConjugate) Oprod2 -= conj(Oprod2); + otherForce -= conj(U1)*Oprod2*U2*U3*conj(U4); + } - { - int d[4] = {0, 0, 0, 0}; - // load U(x-nu)(+nu) - Matrix U1; - d[nu]--; - loadLinkVariableFromArray(otherGauge, nu, linkIndex(x, d, X), - arg.gaugeStride, &U1); - d[nu]++; - - // load U(x-nu)(+mu) - Matrix U2; - d[nu]--; - loadLinkVariableFromArray(otherGauge, mu, linkIndex(x, d, X), - arg.gaugeStride, &U2); - d[nu]++; - - // load U(x+mu-nu)(nu) - Matrix U3; - d[mu]++; d[nu]--; - loadLinkVariableFromArray(thisGauge, nu, linkIndex(x, d, X), - arg.gaugeStride, &U3); - d[mu]--; d[nu]++; - - // load U(x)_(+mu) - Matrix U4; - loadLinkVariableFromArray(thisGauge, mu, linkIndex(x, d, X), - arg.gaugeStride, &U4); - - - Matrix Oprod1; - d[mu]++; d[nu]--; - loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod1); - d[mu]--; d[nu]++; - - if(isConjugate) Oprod1 -= conj(Oprod1); - thisForce -= conj(U1)*U2*Oprod1*U3*conj(U4); - - Matrix Oprod4; - loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod4); - - if(isConjugate) Oprod4 -= conj(Oprod4); - thisForce -= Oprod4*conj(U1)*U2*U3*conj(U4); - } + { + int d[4] = {0, 0, 0, 0}; + // load U(x-nu)(+nu) + Matrix U1; + d[nu]--; + arg.gauge.load((real*)(U1.data), linkIndex(x, d, X), nu, otherparity); + d[nu]++; + + // load U(x-nu)(+mu) + Matrix U2; + d[nu]--; + arg.gauge.load((real*)(U2.data), linkIndex(x, d, X), mu, otherparity); + d[nu]++; + + // load U(x+mu-nu)(nu) + Matrix U3; + d[mu]++; d[nu]--; + arg.gauge.load((real*)(U3.data), linkIndex(x, d, X), nu, arg.parity); + d[mu]--; d[nu]++; + + // load U(x)_(+mu) + Matrix U4; + arg.gauge.load((real*)(U4.data), linkIndex(x, d, X), mu, arg.parity); + + Matrix Oprod1; + d[mu]++; d[nu]--; + arg.oprod.load((real*)(Oprod1.data), linkIndex(x, d, X), 0, arg.parity); + d[mu]--; d[nu]++; + + if(isConjugate) Oprod1 -= conj(Oprod1); + thisForce -= conj(U1)*U2*Oprod1*U3*conj(U4); + + Matrix Oprod4; + arg.oprod.load((real*)(Oprod4.data), linkIndex(x, d, X), 0, arg.parity); + + if(isConjugate) Oprod4 -= conj(Oprod4); + thisForce -= Oprod4*conj(U1)*U2*U3*conj(U4); + } - thisForce *= arg.coeff; - otherForce *= arg.coeff; - + thisForce *= arg.coeff; + otherForce *= arg.coeff; - // Write to array - { - appendMatrixToArray(thisForce, index, arg.forceStride, arg.force + arg.parity*arg.forceOffset); - appendMatrixToArray(otherForce, index, arg.forceStride, arg.force + otherparity*arg.forceOffset); - } - return; - } // cloverDerivativeKernel - - template + // Write to array + { + Matrix F; + arg.force.load((real*)(F.data), index, 0, arg.parity); + F += thisForce; + arg.force.save((real*)(F.data), index, 0, arg.parity); + } + + { + Matrix F; + arg.force.load((real*)(F.data), index, 0, otherparity); + F += otherForce; + arg.force.save((real*)(F.data), index, 0, otherparity); + } + + return; + } // cloverDerivativeKernel + + + template class CloverDerivative : public Tunable { - + private: - CloverDerivArg arg; + Arg arg; const GaugeField &meta; unsigned int sharedBytesPerThread() const { return 0; } @@ -315,14 +293,19 @@ namespace quda { bool tuneGridDim() const { return false; } public: - CloverDerivative(const CloverDerivArg &arg, const GaugeField &meta) + CloverDerivative(const Arg &arg, const GaugeField &meta) : arg(arg), meta(meta) { - writeAuxString("threads=%d,prec=%lu,stride=%d,geometery=%d",arg.volumeCB,sizeof(Complex)/2,arg.forceOffset); + writeAuxString("threads=%d,prec=%lu,fstride=%d,gstride=%d,ostride=%d", + arg.volumeCB,sizeof(Complex)/2,arg.force.stride,arg.gauge.stride,arg.oprod.stride); } virtual ~CloverDerivative() {} void apply(const cudaStream_t &stream){ +#if __COMPUTE_CAPABILITY__ >= 200 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); +#else // tuning not supported on Tesla architecture + TuneParam tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity()); +#endif if(arg.conjugate){ cloverDerivativeKernel<<>>(arg); }else{ @@ -330,65 +313,98 @@ namespace quda { } } // apply - void preTune(){} - void postTune(){} - - long long flops() const { - return 0; - } + // The force field is updated so we must preserve its initial state +#if __COMPUTE_CAPABILITY__ >= 200 + void preTune() { arg.force.save(); } + void postTune(){ arg.force.load(); } +#endif - long long bytes() const { return 0; } + long long flops() const { return 0; } + long long bytes() const { return (16*arg.gauge.Bytes() + 8*arg.oprod.Bytes() + 4*arg.force.Bytes()) * arg.volumeCB; } TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); } }; - - // FIXME - the Tunable class isn't used here + template - void cloverDerivative(cudaGaugeField &out, - cudaGaugeField& gauge, - cudaGaugeField& oprod, - int mu, int nu, double coeff, int parity, - int conjugate) - { - typedef typename ComplexTypeId::Type Complex; - CloverDerivArg arg(out, gauge, oprod, mu, nu, coeff, parity, conjugate); -// CloverDerivative cloverDerivative(arg); -// cloverDerivative.apply(0); - dim3 blockDim(128, 1, 1); - dim3 gridDim((arg.volumeCB + blockDim.x-1)/blockDim.x, 1, 1); - if(conjugate){ - cloverDerivativeKernel<<>>(arg); - }else{ - cloverDerivativeKernel<<>>(arg); + void cloverDerivative(cudaGaugeField &out, + cudaGaugeField &gauge, + cudaGaugeField &oprod, + int mu, int nu, double coeff, int parity, + int conjugate) { + + if (oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) + errorQuda("Force field does not support reconstruction"); + + if (out.Order() != oprod.Order()) + errorQuda("Force and Oprod orders must match"); + + if (out.Reconstruct() != QUDA_RECONSTRUCT_NO) + errorQuda("Force field does not support reconstruction"); + + typedef typename ComplexTypeId::Type Complex; + + if (out.Order() == QUDA_FLOAT2_GAUGE_ORDER){ + typedef FloatNOrder F; + typedef FloatNOrder O; + + if(gauge.Order() == QUDA_FLOAT2_GAUGE_ORDER){ + if(gauge.Reconstruct() == QUDA_RECONSTRUCT_NO){ + typedef FloatNOrder G; + typedef CloverDerivArg Arg; + Arg arg(F(out), G(gauge), O(oprod), out.X(), mu, nu, coeff, parity, conjugate); + CloverDerivative deriv(arg, gauge); + deriv.apply(0); + }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){ + typedef FloatNOrder G; + typedef CloverDerivArg Arg; + Arg arg(F(out), G(gauge), O(oprod), out.X(), mu, nu, coeff, parity, conjugate); + CloverDerivative deriv(arg, gauge); + deriv.apply(0); + }else{ + errorQuda("Reconstruction type %d not supported",gauge.Reconstruct()); + } + + }else if(gauge.Order() == QUDA_FLOAT4_GAUGE_ORDER){ + if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){ + typedef FloatNOrder G; + typedef CloverDerivArg Arg; + Arg arg(F(out), G(gauge), O(oprod), out.X(), mu, nu, coeff, parity, conjugate); + CloverDerivative deriv(arg, gauge); + deriv.apply(0); + }else{ + errorQuda("Reconstruction type %d not supported",gauge.Reconstruct()); + } } - } - -#endif + } else { + errorQuda("Force order %d not supported"); + } // force / oprod order + } +#endif // GPU_CLOVER - void cloverDerivative(cudaGaugeField &out, - cudaGaugeField& gauge, - cudaGaugeField& oprod, - int mu, int nu, double coeff, QudaParity parity, int conjugate) - { +void cloverDerivative(cudaGaugeField &out, + cudaGaugeField& gauge, + cudaGaugeField& oprod, + int mu, int nu, double coeff, QudaParity parity, int conjugate) +{ #ifdef GPU_CLOVER_DIRAC - assert(oprod.Geometry() == QUDA_SCALAR_GEOMETRY); - assert(out.Geometry() == QUDA_SCALAR_GEOMETRY); + assert(oprod.Geometry() == QUDA_SCALAR_GEOMETRY); + assert(out.Geometry() == QUDA_SCALAR_GEOMETRY); - int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1; + int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1; - if(out.Precision() == QUDA_DOUBLE_PRECISION){ - cloverDerivative(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate); - } else if (out.Precision() == QUDA_SINGLE_PRECISION){ - cloverDerivative(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate); - } else { - errorQuda("Precision %d not supported", out.Precision()); - } - return; + if(out.Precision() == QUDA_DOUBLE_PRECISION){ + cloverDerivative(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate); + } else if (out.Precision() == QUDA_SINGLE_PRECISION){ + cloverDerivative(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate); + } else { + errorQuda("Precision %d not supported", out.Precision()); + } + return; #else - errorQuda("Clover has not been built"); + errorQuda("Clover has not been built"); #endif - } +} } // namespace quda diff --git a/lib/clover_trace_quda.cu b/lib/clover_trace_quda.cu index e641d6295e..7e6331729f 100644 --- a/lib/clover_trace_quda.cu +++ b/lib/clover_trace_quda.cu @@ -219,9 +219,8 @@ namespace quda { void apply(const cudaStream_t &stream){ if(location == QUDA_CUDA_FIELD_LOCATION){ #if (__COMPUTE_CAPABILITY__ >= 200) - dim3 blockDim(128, 1, 1); - dim3 gridDim((arg.clover1.volumeCB + blockDim.x - 1)/blockDim.x, 1, 1); - cloverSigmaTraceKernel<<>>(arg); + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + cloverSigmaTraceKernel<<>>(arg); #else errorQuda("cloverSigmaTrace not supported on pre-Fermi architecture"); #endif @@ -240,7 +239,7 @@ namespace quda { } long long flops() const { return 0; } // Fix this - long long bytes() const { return 0; } // Fix this + long long bytes() const { return (arg.clover1.Bytes() + arg.gauge.Bytes()) * arg.clover1.volumeCB; } }; // CloverSigmaTrace @@ -252,7 +251,6 @@ namespace quda { CloverTraceArg arg(clover1, clover2, gauge, dir1, dir2); CloverSigmaTrace traceCompute(arg, meta, location); traceCompute.apply(0); - cudaDeviceSynchronize(); return; } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 6c1702cea4..27939a557f 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -677,7 +677,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (!initialized) errorQuda("QUDA not initialized"); if (!h_clover && !h_clovinv) { - printfQuda("clover_coeff: %lf\n", inv_param->clover_coeff); if(inv_param->clover_coeff != 0){ device_calc = true; }else{ @@ -3788,7 +3787,6 @@ void createCloverQuda(QudaInvertParam* invertParam) profileCloverCreate.Start(QUDA_PROFILE_TOTAL); profileCloverCreate.Start(QUDA_PROFILE_INIT); if(!cloverPrecise){ - printfQuda("About to create cloverPrecise\n"); CloverFieldParam cloverParam; cloverParam.nDim = 4; for(int dir=0; dir<4; ++dir) cloverParam.x[dir] = gaugePrecise->X()[dir]; @@ -3898,7 +3896,7 @@ void createCloverQuda(QudaInvertParam* invertParam) return; } -void* createGaugeField(void* gauge, int geometry, QudaGaugeParam* param) +void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) { GaugeFieldParam gParam(0,*param); @@ -3928,7 +3926,7 @@ void* createGaugeField(void* gauge, int geometry, QudaGaugeParam* param) } -void saveGaugeField(void* gauge, void* inGauge, QudaGaugeParam* param){ +void saveGaugeFieldQuda(void* gauge, void* inGauge, QudaGaugeParam* param){ cudaGaugeField* cudaGauge = reinterpret_cast(inGauge); @@ -3946,7 +3944,7 @@ void saveGaugeField(void* gauge, void* inGauge, QudaGaugeParam* param){ } -void* createExtendedGaugeField(void* gauge, int geometry, QudaGaugeParam* param) +void* createExtendedGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) { profileExtendedGauge.Start(QUDA_PROFILE_TOTAL); @@ -4028,7 +4026,7 @@ void* createExtendedGaugeField(void* gauge, int geometry, QudaGaugeParam* param) } // extend field on the GPU -void extendGaugeField(void* out, void* in){ +void extendGaugeFieldQuda(void* out, void* in){ cudaGaugeField* inGauge = reinterpret_cast(in); cudaGaugeField* outGauge = reinterpret_cast(out); @@ -4042,7 +4040,7 @@ void extendGaugeField(void* out, void* in){ -void destroyQudaGaugeField(void* gauge){ +void destroyGaugeFieldQuda(void* gauge){ cudaGaugeField* g = reinterpret_cast(gauge); delete g; } diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp index c815ebbbed..ad89e51c2c 100644 --- a/lib/milc_interface.cpp +++ b/lib/milc_interface.cpp @@ -878,14 +878,25 @@ void qudaInvert(int external_precision, #ifdef GPU_CLOVER_DIRAC -void* qudaCreateExtendedGaugeField(void* gauge, int geometry, int precision) +static inline void* createExtendedGaugeField(void* gauge, int geometry, int precision, int resident) { qudamilc_called(__func__); QudaPrecision qudaPrecision = (precision==2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; QudaGaugeParam gaugeParam = newMILCGaugeParam(localDim, qudaPrecision, (geometry==1) ? QUDA_GENERAL_LINKS : QUDA_SU3_LINKS); + gaugeParam.use_resident_gauge = resident ? 1 : 0; + + return createExtendedGaugeFieldQuda(gauge, geometry, &gaugeParam); +} + +void* qudaCreateExtendedGaugeField(void* gauge, int geometry, int precision) +{ + return createExtendedGaugeField(gauge, geometry, precision, 0); +} - return createExtendedGaugeField(gauge, geometry, &gaugeParam); +void* qudaResidentExtendedGaugeField(void* gauge, int geometry, int precision) +{ + return createExtendedGaugeField(gauge, geometry, precision, 1); } @@ -896,7 +907,7 @@ void* qudaCreateGaugeField(void* gauge, int geometry, int precision) QudaGaugeParam gaugeParam = newMILCGaugeParam(localDim, qudaPrecision, (geometry==1) ? QUDA_GENERAL_LINKS : QUDA_SU3_LINKS); - return createGaugeField(gauge, geometry, &gaugeParam); + return createGaugeFieldQuda(gauge, geometry, &gaugeParam); } @@ -905,7 +916,7 @@ void qudaSaveGaugeField(void* gauge, void* inGauge) qudamilc_called(__func__); cudaGaugeField* cudaGauge = reinterpret_cast(inGauge); QudaGaugeParam gaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS); - saveGaugeField(gauge, inGauge, &gaugeParam); + saveGaugeFieldQuda(gauge, inGauge, &gaugeParam); qudamilc_called(__func__); return; } @@ -914,7 +925,7 @@ void qudaSaveGaugeField(void* gauge, void* inGauge) void qudaDestroyGaugeField(void* gauge) { qudamilc_called(__func__); - destroyQudaGaugeField(gauge); + destroyGaugeFieldQuda(gauge); qudamilc_called(__func__); return; } @@ -994,7 +1005,7 @@ void setGaugeParams(QudaGaugeParam &gaugeParam, const int dim[4], QudaInvertArgs void setInvertParam(QudaInvertParam &invertParam, QudaInvertArgs_t &inv_args, - int external_precision, int quda_precision, double kappa) { + int external_precision, int quda_precision, double kappa, double reliable_delta) { const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; @@ -1014,7 +1025,7 @@ void setInvertParam(QudaInvertParam &invertParam, QudaInvertArgs_t &inv_args, invertParam.dagger = QUDA_DAG_NO; invertParam.mass_normalization = QUDA_KAPPA_NORMALIZATION; invertParam.gcrNkrylov = 30; - invertParam.reliable_delta = 1e-1; + invertParam.reliable_delta = reliable_delta; invertParam.maxiter = inv_args.max_iter; invertParam.cuda_prec_precondition = device_precision_sloppy; @@ -1069,7 +1080,7 @@ void qudaLoadCloverField(int external_precision, double *trlog) { QudaInvertParam invertParam = newQudaInvertParam(); - setInvertParam(invertParam, inv_args, external_precision, quda_precision, 0.0); + setInvertParam(invertParam, inv_args, external_precision, quda_precision, 0.0, 0.0); invertParam.solution_type = solution_type; invertParam.solve_type = solve_type; invertParam.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC; @@ -1132,8 +1143,10 @@ void qudaCloverInvert(int external_precision, qudaLoadCloverField(external_precision, quda_precision, inv_args, clover, cloverInverse, QUDA_MAT_SOLUTION, QUDA_DIRECT_PC_SOLVE, clover_coeff, 0, 0); + double reliable_delta = 1e-1; + QudaInvertParam invertParam = newQudaInvertParam(); - setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa); + setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta); invertParam.residual_type = static_cast(0); invertParam.residual_type = (target_residual != 0) ? static_cast ( invertParam.residual_type | QUDA_L2_RELATIVE_RESIDUAL) : invertParam.residual_type; invertParam.residual_type = (target_fermilab_residual != 0) ? static_cast (invertParam.residual_type | QUDA_HEAVY_QUARK_RESIDUAL) : invertParam.residual_type; @@ -1186,8 +1199,10 @@ void qudaCloverMultishiftInvert(int external_precision, } } + // if doing a pure double-precision multi-shift solve don't use reliable updates + double reliable_delta = (inv_args.mixed_precision == 1 || quda_precision == 1) ? 1e-1 : 0.0; QudaInvertParam invertParam = newQudaInvertParam(); - setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa); + setInvertParam(invertParam, inv_args, external_precision, quda_precision, kappa, reliable_delta); invertParam.residual_type = QUDA_L2_RELATIVE_RESIDUAL; invertParam.num_offset = num_offsets; for(int i=0; i