From 153d44a616e6640ae61a9d0edbdfe50e4adef76a Mon Sep 17 00:00:00 2001 From: M Clark Date: Sat, 4 Jun 2016 14:54:54 -0700 Subject: [PATCH 01/24] Implemented reference clover application operator and enabled true Wilson-clover unit testing in dslash_test and invert_test. Some minor anciliiary clean up of reference blas as the same time. --- tests/CMakeLists.txt | 6 +- tests/Makefile | 6 +- tests/blas_reference.cpp | 12 +++ tests/blas_reference.h | 2 + tests/clover_reference.cpp | 142 +++++++++++++++++++++++++ tests/domain_wall_dslash_reference.cpp | 54 +++------- tests/dslash_test.cpp | 34 +++++- tests/dslash_util.h | 6 -- tests/invert_test.cpp | 25 ++++- tests/wilson_dslash_reference.cpp | 117 +++++++++----------- tests/wilson_dslash_reference.h | 31 ++++-- 11 files changed, 299 insertions(+), 136 deletions(-) create mode 100644 tests/clover_reference.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4c2a793310..70825f96fd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -27,10 +27,10 @@ endif() #define tests if(${QUDA_DIRAC_WILSON} OR ${QUDA_DIRAC_DOMAIN_WALL}) - cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp) + cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp) target_link_libraries(dslash_test ${TEST_LIBS} ) - cuda_add_executable(invert_test invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp) + cuda_add_executable(invert_test invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp blas_reference.cpp) target_link_libraries(invert_test ${TEST_LIBS}) endif() @@ -46,7 +46,7 @@ if(${QUDA_DIRAC_STAGGERED}) endif() if(${QUDA_MULTIGRID}) - cuda_add_executable(multigrid_invert_test multigrid_invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp) + cuda_add_executable(multigrid_invert_test multigrid_invert_test.cpp wilson_dslash_reference.cpp clover_reference.cpp domain_wall_dslash_reference.cpp blas_reference.cpp) target_link_libraries(multigrid_invert_test ${TEST_LIBS}) cuda_add_executable(multigrid_benchmark_test multigrid_benchmark_test.cu) diff --git a/tests/Makefile b/tests/Makefile index c37e85ddc8..13479c38e1 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -56,13 +56,13 @@ TESTS = su3_test pack_test blas_test dslash_test invert_test \ all: $(TESTS) -dslash_test: dslash_test.o test_util.o gtest-all.o wilson_dslash_reference.o domain_wall_dslash_reference.o misc.o $(QUDA) +dslash_test: dslash_test.o test_util.o gtest-all.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) -invert_test: invert_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) +invert_test: invert_test.o test_util.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) -multigrid_invert_test: multigrid_invert_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) +multigrid_invert_test: multigrid_invert_test.o test_util.o wilson_dslash_reference.o clover_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) multigrid_benchmark_test: multigrid_benchmark_test.o test_util.o misc.o $(QUDA) diff --git a/tests/blas_reference.cpp b/tests/blas_reference.cpp index 1651734150..056926d1ef 100644 --- a/tests/blas_reference.cpp +++ b/tests/blas_reference.cpp @@ -50,6 +50,18 @@ double norm_2(void *v, int len, QudaPrecision precision) { else return norm2((float*)v, len); } +// performs the operation y[i] = x[i] + a*y[i] +template +static inline void xpay(Float *x, Float a, Float *y, int len) { + for (int i=0; i +#include +#include +#include + +#include +#include +#include +#include + + +/** + @brief Apply the clover matrix field + @param[out] out Result field (single parity) + @param[in] clover Clover-matrix field (full field) + @param[in] in Input field (single parity) + @param[in] parity Parity to which we are applying the clover field + */ +template +void cloverReference(sFloat *out, cFloat *clover, sFloat *in, int parity) { + int nSpin = 4; + int nColor = 3; + int N = nSpin * nColor / 2; + int chiralBlock = (nSpin/2) * (nSpin/2) * nColor * nColor; + + for (int i = 0; i < Vh; i++) { + std::complex *In = reinterpret_cast*>(&in[i*nSpin*nColor*2]); + std::complex *Out = reinterpret_cast*>(&out[i*nSpin*nColor*2]); + for (int i=0; i *L = reinterpret_cast*>(&clover[((parity*Vh + i)*2 + chi)*chiralBlock+N]); + + for (int s_col=0; s_col(out), static_cast(clover), static_cast(in), parity); + case QUDA_SINGLE_PRECISION: + cloverReference(static_cast(out), static_cast(clover), static_cast(in), parity); + default: + errorQuda("Unsupported precision %d", precision); + } + +} + +void clover_dslash(void *out, void **gauge, void *clover, void *in, int parity, + int dagger, QudaPrecision precision, QudaGaugeParam ¶m) { + void *tmp = malloc(Vh*spinorSiteSize*precision); + + wil_dslash(tmp, gauge, in, parity, dagger, precision, param); + apply_clover(out, clover, tmp, parity, precision); + + free(tmp); +} + +// Apply the even-odd preconditioned Wilson-clover operator +void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, + QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + + double kappa2 = -kappa*kappa; + void *tmp = malloc(Vh*spinorSiteSize*precision); + + switch(matpc_type) { + case QUDA_MATPC_EVEN_EVEN: + wil_dslash(tmp, gauge, in, 1, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 1, precision); + wil_dslash(tmp, gauge, out, 0, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 0, precision); + xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + case QUDA_MATPC_EVEN_EVEN_ASYMMETRIC: + wil_dslash(out, gauge, in, 1, dagger, precision, gauge_param); + apply_clover(tmp, clover_inv, out, 1, precision); + wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param); + apply_clover(tmp, clover, in, 0, precision); + xpay(tmp, kappa2, out, Vh*spinorSiteSize, precision); + case QUDA_MATPC_ODD_ODD: + wil_dslash(tmp, gauge, in, 0, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 0, precision); + wil_dslash(tmp, gauge, out, 1, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 1, precision); + xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + case QUDA_MATPC_ODD_ODD_ASYMMETRIC: + wil_dslash(out, gauge, in, 0, dagger, precision, gauge_param); + apply_clover(tmp, clover_inv, out, 0, precision); + wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param); + apply_clover(tmp, clover, in, 1, precision); + xpay(tmp, kappa2, out, Vh*spinorSiteSize, precision); + default: + errorQuda("Unsupoorted matpc=%d", matpc_type); + } + + free(tmp); +} + +// Apply the full Wilson-clover operator +void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, + int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + + void *inEven = in; + void *inOdd = (char*)in + Vh*spinorSiteSize*precision; + void *outEven = out; + void *outOdd = (char*)out + Vh*spinorSiteSize*precision; + + clover_dslash(outOdd, gauge, clover, inEven, 1, dagger, precision, gauge_param); + clover_dslash(outEven, gauge, clover, inOdd, 0, dagger, precision, gauge_param); + + // lastly apply the kappa term + xpay(in, -kappa, out, V*spinorSiteSize, precision); +} diff --git a/tests/domain_wall_dslash_reference.cpp b/tests/domain_wall_dslash_reference.cpp index 2b5ce2dd90..a2e53ff9a8 100644 --- a/tests/domain_wall_dslash_reference.cpp +++ b/tests/domain_wall_dslash_reference.cpp @@ -16,12 +16,6 @@ using namespace quda; -void xpay(void *x, double a, void *y, int length, QudaPrecision precision) { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)x, a, (double*)y, length); - else xpay((float*)x, (float)a, (float*)y, length); -} - - // i represents a "half index" into an even or odd "half lattice". // when oddBit={0,1} the half lattice is {even,odd}. // @@ -839,17 +833,12 @@ void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit, if (precision == QUDA_DOUBLE_PRECISION) { if (zero_initialize) dslashReference_5th((double*)out, (double*)in, oddBit, daggerBit, mferm); else dslashReference_5th((double*)out, (double*)in, oddBit, daggerBit, mferm); - for(int xs = 0 ; xs < Ls ; xs++) - { - xpay((double*)in + Vh*spinorSiteSize*xs, kappa[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - } } else { if (zero_initialize) dslashReference_5th((float*)out, (float*)in, oddBit, daggerBit, (float)mferm); else dslashReference_5th((float*)out, (float*)in, oddBit, daggerBit, (float)mferm); - for(int xs = 0 ; xs < Ls ; xs++) - { - xpay((float*)in + Vh*spinorSiteSize*xs, (float)(kappa[xs]), (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - } + } + for(int xs = 0 ; xs < Ls ; xs++) { + xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa[xs], (char*)out + precision*Vh*spinorSiteSize*xs, Vh*spinorSiteSize, precision); } } @@ -921,10 +910,8 @@ void mdw_mat(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c mdw_dslash_5(tmp, gauge, inOdd, 1, dagger, precision, gauge_param, mferm, kappa5, true); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) - xpay((double*)tmp + Vh*spinorSiteSize*xs, -kappa_b[xs], (double*)outOdd + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else - xpay((float*)tmp + Vh*spinorSiteSize*xs, -(float)kappa_b[xs], (float*)outOdd + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, -kappa_b[xs], (char*)outOdd + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } mdw_dslash_4_pre(tmp, gauge, inOdd, 1, dagger, precision, gauge_param, mferm, b5, c5, true); @@ -932,10 +919,8 @@ void mdw_mat(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c mdw_dslash_5(tmp, gauge, inEven, 0, dagger, precision, gauge_param, mferm, kappa5, true); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) - xpay((double*)tmp + Vh*spinorSiteSize*xs, -kappa_b[xs], (double*)outEven + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else - xpay((float*)tmp + Vh*spinorSiteSize*xs, -(float)kappa_b[xs], (float*)outEven + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, -kappa_b[xs], (char*)outEven + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } free(kappa5); @@ -1040,10 +1025,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm); dslash_5_inv(out, gauge, tmp, parity[0], dagger, precision, gauge_param, mferm, kappa_mdwf); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) - xpay((double*)in + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else - xpay((float*)in + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } } else if (symmetric && dagger) { dslash_5_inv(tmp, gauge, in, parity[1], dagger, precision, gauge_param, mferm, kappa_mdwf); @@ -1053,10 +1036,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm); mdw_dslash_4_pre(out, gauge, tmp, parity[1], dagger, precision, gauge_param, mferm, b5, c5, true); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) - xpay((double*)in + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else - xpay((float*)in + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)in + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } } else if (!symmetric && !dagger) { mdw_dslash_4_pre(out, gauge, in, parity[1], dagger, precision, gauge_param, mferm, b5, c5, true); @@ -1066,8 +1047,8 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa dslash_4_4d(out, gauge, tmp, parity[1], dagger, precision, gauge_param, mferm); mdw_dslash_5(tmp, gauge, in, parity[0], dagger, precision, gauge_param, mferm, kappa5, true); for(int xs = 0 ; xs < Ls ; xs++) { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else xpay((float*)tmp + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); + xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); } } else if (!symmetric && dagger) { dslash_4_4d(out, gauge, in, parity[0], dagger, precision, gauge_param, mferm); @@ -1076,11 +1057,10 @@ void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa dslash_4_4d(tmp, gauge, out, parity[1], dagger, precision, gauge_param, mferm); mdw_dslash_4_pre(out, gauge, tmp, parity[0], dagger, precision, gauge_param, mferm, b5, c5, true); mdw_dslash_5(tmp, gauge, in, parity[0], dagger, precision, gauge_param, mferm, kappa5, true); - for(int xs = 0 ; xs < Ls ; xs++) - { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - else xpay((float*)tmp + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize); - } + for(int xs = 0 ; xs < Ls ; xs++) { + xpay((char*)tmp + precision*Vh*spinorSiteSize*xs, kappa2[xs], (char*)out + precision*Vh*spinorSiteSize*xs, + Vh*spinorSiteSize, precision); + } } else { errorQuda("Unsupported matpc_type=%d dagger=%d", matpc_type, dagger); } diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index a84957f955..d18f2a9f2a 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -618,13 +618,12 @@ void dslashRef() { printfQuda("Calculating reference implementation..."); fflush(stdout); - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || - dslash_type == QUDA_WILSON_DSLASH) { + if (dslash_type == QUDA_WILSON_DSLASH) { switch (test_type) { case 0: wil_dslash(spinorRef->V(), hostGauge, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param); break; - case 1: + case 1: wil_matpc(spinorRef->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); break; @@ -645,7 +644,34 @@ void dslashRef() { printfQuda("Test type not defined\n"); exit(-1); } - } else if((dslash_type == QUDA_TWISTED_MASS_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)){ + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + switch (test_type) { + case 0: + clover_dslash(spinorTmp->V(), hostGauge, hostClover, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param); + break; + case 1: + clover_matpc(spinorRef->V(), hostGauge, hostClover, hostCloverInv, spinor->V(), inv_param.kappa, inv_param.matpc_type, + dagger, inv_param.cpu_prec, gauge_param); + break; + case 2: + clover_mat(spinorRef->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, dagger, inv_param.cpu_prec, gauge_param); + break; + case 3: + clover_matpc(spinorTmp->V(), hostGauge, hostClover, hostCloverInv, spinor->V(), inv_param.kappa, inv_param.matpc_type, + dagger, inv_param.cpu_prec, gauge_param); + clover_matpc(spinorRef->V(), hostGauge, hostClover, hostCloverInv, spinorTmp->V(), inv_param.kappa, inv_param.matpc_type, + not_dagger, inv_param.cpu_prec, gauge_param); + break; + case 4: + clover_mat(spinorTmp->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, dagger, inv_param.cpu_prec, gauge_param); + clover_mat(spinorRef->V(), hostGauge, hostClover, spinorTmp->V(), inv_param.kappa, not_dagger, + inv_param.cpu_prec, gauge_param); + break; + default: + printfQuda("Test type not defined\n"); + exit(-1); + } + } else if ((dslash_type == QUDA_TWISTED_MASS_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)) { switch (test_type) { case 0: if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) diff --git a/tests/dslash_util.h b/tests/dslash_util.h index 821535beac..a848a8a559 100644 --- a/tests/dslash_util.h +++ b/tests/dslash_util.h @@ -22,12 +22,6 @@ static inline void ax(Float *dst, Float a, Float *x, int cnt) { dst[i] = a * x[i]; } -// performs the operation y[i] = x[i] + a*y[i] -template -static inline void xpay(Float *x, Float a, Float *y, int len) { - for (int i=0; i static inline void axpy(Float a, Float *x, Float *y, int len) { diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index 9f239ba77e..4a1f9e3216 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -424,11 +424,16 @@ int main(int argc, char **argv) inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); - } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + } else if (dslash_type == QUDA_WILSON_DSLASH) { wil_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + clover_matpc(spinorTmp, gauge, clover, clover_inv, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0, + inv_param.cpu_prec, gauge_param); + clover_matpc(spinorCheck, gauge, clover, clover_inv, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1, + inv_param.cpu_prec, gauge_param); } else { printfQuda("Domain wall not supported for multi-shift\n"); exit(-1); @@ -464,8 +469,10 @@ int main(int argc, char **argv) tm_ndeg_mat(evenOut, oddOut, gauge, evenIn, oddIn, inv_param.kappa, inv_param.mu, inv_param.epsilon, 0, inv_param.cpu_prec, gauge_param); } - } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + } else if (dslash_type == QUDA_WILSON_DSLASH) { wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + clover_mat(spinorCheck, gauge, clover, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { dw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass); } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) { @@ -504,9 +511,12 @@ int main(int argc, char **argv) errorQuda("Twisted mass solution type not supported"); tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); - } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { - wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, + } else if (dslash_type == QUDA_WILSON_DSLASH) { + wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + clover_matpc(spinorCheck, gauge, clover, clover_inv, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, + inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { dw_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass); } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) { @@ -551,11 +561,16 @@ int main(int argc, char **argv) inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); - } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + } else if (dslash_type == QUDA_WILSON_DSLASH) { wil_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + clover_matpc(spinorTmp, gauge, clover, clover_inv, spinorOut, inv_param.kappa, + inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); + clover_matpc(spinorCheck, gauge, clover, clover_inv, spinorTmp, inv_param.kappa, + inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); } else { errorQuda("Unsupported dslash_type"); } diff --git a/tests/wilson_dslash_reference.cpp b/tests/wilson_dslash_reference.cpp index 752411a7b5..8d9c3baa87 100644 --- a/tests/wilson_dslash_reference.cpp +++ b/tests/wilson_dslash_reference.cpp @@ -4,6 +4,7 @@ #include + #include #include #include @@ -303,8 +304,7 @@ void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, Qu wil_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param); // lastly apply the kappa term - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V*spinorSiteSize); - else xpay((float*)in, -(float)kappa, (float*)out, V*spinorSiteSize); + xpay(in, -kappa, out, V*spinorSiteSize, precision); } void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, @@ -324,8 +324,7 @@ void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, twist_gamma5(tmp, in, dagger_bit, kappa, mu, flavor, V, QUDA_TWIST_GAMMA5_DIRECT, precision); // combine - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, -kappa, (double*)out, V*spinorSiteSize); - else xpay((float*)tmp, -(float)kappa, (float*)out, V*spinorSiteSize); + xpay(tmp, -kappa, (double*)out, V*spinorSiteSize, precision); free(tmp); } @@ -349,8 +348,7 @@ void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, // lastly apply the kappa term double kappa2 = -kappa*kappa; - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize); - else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); + xpay(inEven, kappa2, outEven, Vh*spinorSiteSize, precision); free(tmp); } @@ -401,11 +399,9 @@ void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu // lastly apply the kappa term double kappa2 = -kappa*kappa; if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD) { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize); - else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); + xpay(inEven, kappa2, outEven, Vh*spinorSiteSize, precision); } else { - if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, kappa2, (double*)outEven, Vh*spinorSiteSize); - else xpay((float*)tmp, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); + xpay(tmp, kappa2, outEven, Vh*spinorSiteSize, precision); } free(tmp); @@ -419,7 +415,7 @@ void ndegTwistGamma5(sFloat *out1, sFloat *out2, sFloat *in1, sFloat *in2, const sFloat a=0.0, b=0.0, d=0.0; if (twist == QUDA_TWIST_GAMMA5_DIRECT) { // applying the twist - a = 2.0 * kappa * mu; + a = 2.0 * kappa * mu; b = -2.0 * kappa * epsilon; d = 1.0; } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) { // applying the inverse twist @@ -485,66 +481,60 @@ void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, if (!daggerBit) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) { - wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param); - wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param); + wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param); + wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); ndeg_twist_gamma5(outEven1, outEven2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { - wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param); - wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param); + wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param); + wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); ndeg_twist_gamma5(outEven1, outEven2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); } } else { if (matpc_type == QUDA_MATPC_EVEN_EVEN) { ndeg_twist_gamma5(tmp1, tmp2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { ndeg_twist_gamma5(tmp1, tmp2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, outEven1, outEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); } } if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { - wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param); - wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param); + wil_dslash(tmp1, gauge, inEven1, 1, daggerBit, precision, gauge_param); + wil_dslash(tmp2, gauge, inEven2, 1, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 0, daggerBit, precision, gauge_param); } else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { - wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param); - wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param); + wil_dslash(tmp1, gauge, inEven1, 0, daggerBit, precision, gauge_param); + wil_dslash(tmp2, gauge, inEven2, 0, daggerBit, precision, gauge_param); ndeg_twist_gamma5(tmp1, tmp2, tmp1, tmp2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); - wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, tmp1, 1, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, tmp2, 1, daggerBit, precision, gauge_param); } // lastly apply the kappa term double kappa2 = -kappa*kappa; if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { - ndeg_twist_gamma5(inEven1, inEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); + ndeg_twist_gamma5(inEven1, inEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); } - if (precision == QUDA_DOUBLE_PRECISION){ - xpay((double*)inEven1, kappa2, (double*)outEven1, Vh*spinorSiteSize); - xpay((double*)inEven2, kappa2, (double*)outEven2, Vh*spinorSiteSize); - } - else{ - xpay((float*)inEven1, (float)kappa2, (float*)outEven1, Vh*spinorSiteSize); - xpay((float*)inEven2, (float)kappa2, (float*)outEven2, Vh*spinorSiteSize); - } + xpay(inEven1, kappa2, outEven1, Vh*spinorSiteSize, precision); + xpay(inEven2, kappa2, outEven2, Vh*spinorSiteSize, precision); free(tmp1); free(tmp2); @@ -555,48 +545,39 @@ void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void * { //V-4d volume and Vh=V/2 void *inEven1 = evenIn; - void *inEven2 = (char*)evenIn + precision*Vh*spinorSiteSize; + void *inEven2 = (char*)evenIn + precision*Vh*spinorSiteSize; void *inOdd1 = oddIn; - void *inOdd2 = (char*)oddIn + precision*Vh*spinorSiteSize; + void *inOdd2 = (char*)oddIn + precision*Vh*spinorSiteSize; void *outEven1 = evenOut; - void *outEven2 = (char*)evenOut + precision*Vh*spinorSiteSize; + void *outEven2 = (char*)evenOut + precision*Vh*spinorSiteSize; void *outOdd1 = oddOut; - void *outOdd2 = (char*)oddOut + precision*Vh*spinorSiteSize; + void *outOdd2 = (char*)oddOut + precision*Vh*spinorSiteSize; void *tmpEven1 = malloc(Vh*spinorSiteSize*precision); void *tmpEven2 = malloc(Vh*spinorSiteSize*precision); void *tmpOdd1 = malloc(Vh*spinorSiteSize*precision); - void *tmpOdd2 = malloc(Vh*spinorSiteSize*precision); + void *tmpOdd2 = malloc(Vh*spinorSiteSize*precision); // full dslash operator: - wil_dslash(outOdd1, gauge, inEven1, 1, daggerBit, precision, gauge_param); - wil_dslash(outOdd2, gauge, inEven2, 1, daggerBit, precision, gauge_param); + wil_dslash(outOdd1, gauge, inEven1, 1, daggerBit, precision, gauge_param); + wil_dslash(outOdd2, gauge, inEven2, 1, daggerBit, precision, gauge_param); - wil_dslash(outEven1, gauge, inOdd1, 0, daggerBit, precision, gauge_param); - wil_dslash(outEven2, gauge, inOdd2, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven1, gauge, inOdd1, 0, daggerBit, precision, gauge_param); + wil_dslash(outEven2, gauge, inOdd2, 0, daggerBit, precision, gauge_param); // apply the twist term - ndeg_twist_gamma5(tmpEven1, tmpEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); + ndeg_twist_gamma5(tmpEven1, tmpEven2, inEven1, inEven2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); ndeg_twist_gamma5(tmpOdd1, tmpOdd2, inOdd1, inOdd2, daggerBit, kappa, mu, epsilon, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); // combine - if (precision == QUDA_DOUBLE_PRECISION){ - xpay((double*)tmpOdd1, -kappa, (double*)outOdd1, Vh*spinorSiteSize); - xpay((double*)tmpOdd2, -kappa, (double*)outOdd2, Vh*spinorSiteSize); - - xpay((double*)tmpEven1, -kappa, (double*)outEven1, Vh*spinorSiteSize); - xpay((double*)tmpEven2, -kappa, (double*)outEven2, Vh*spinorSiteSize); - } - else{ - xpay((float*)tmpOdd1, (float)(-kappa), (float*)outOdd1, Vh*spinorSiteSize); - xpay((float*)tmpOdd2, (float)(-kappa), (float*)outOdd2, Vh*spinorSiteSize); - - xpay((float*)tmpEven1, (float)(-kappa), (float*)outEven1, Vh*spinorSiteSize); - xpay((float*)tmpEven2, (float)(-kappa), (float*)outEven2, Vh*spinorSiteSize); - } + xpay(tmpOdd1, -kappa, outOdd1, Vh*spinorSiteSize, precision); + xpay(tmpOdd2, -kappa, outOdd2, Vh*spinorSiteSize, precision); + + xpay(tmpEven1, -kappa, outEven1, Vh*spinorSiteSize, precision); + xpay(tmpEven2, -kappa, outEven2, Vh*spinorSiteSize, precision); free(tmpOdd1); free(tmpOdd2); diff --git a/tests/wilson_dslash_reference.h b/tests/wilson_dslash_reference.h index e066613f05..4b270df7a8 100644 --- a/tests/wilson_dslash_reference.h +++ b/tests/wilson_dslash_reference.h @@ -10,7 +10,7 @@ extern "C" { void wil_dslash(void *res, void **gauge, void *spinorField, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); - + void wil_mat(void *out, void **gauge, void *in, double kappa, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); @@ -20,22 +20,33 @@ extern "C" { void tm_dslash(void *res, void **gauge, void *spinorField, double kappa, double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type, int daggerBit, QudaPrecision sprecision, QudaGaugeParam ¶m); - + void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); void tm_matpc(void *out, void **gauge, void *in, double kappa, double mu, - QudaTwistFlavorType flavor, QudaMatPCType matpc_type, + QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); - void tm_ndeg_dslash(void *res1, void *res2, void **gaugeFull, void *spinorField1, void *spinorField2, - double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, - QudaPrecision precision, QudaGaugeParam &gauge_param) ; + void tm_ndeg_dslash(void *res1, void *res2, void **gaugeFull, void *spinorField1, void *spinorField2, + double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, + QudaPrecision precision, QudaGaugeParam &gauge_param); void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, void *inEven2, double kappa, double mu, double epsilon, - QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param); - - void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void *oddIn, - double kappa, double mu, double epsilon, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param); + QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void tm_ndeg_mat(void *evenOut, void* oddOut, void **gauge, void *evenIn, void *oddIn, + double kappa, double mu, double epsilon, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void apply_clover(void *out, void *clover, void *in, int parity, QudaPrecision precision); + + void clover_dslash(void *res, void **gauge, void *clover, void *spinorField, int oddBit, + int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); + + void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, + int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, + QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param); #ifdef __cplusplus } From af6d23a8c3f2ff97c326dde547da35c70c202ef5 Mon Sep 17 00:00:00 2001 From: M Clark Date: Sat, 4 Jun 2016 14:57:06 -0700 Subject: [PATCH 02/24] Fixed cmake bug in dslash_test and invert_test compilation --- tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 70825f96fd..a6d1a36c3e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -26,7 +26,7 @@ endif() #define tests -if(${QUDA_DIRAC_WILSON} OR ${QUDA_DIRAC_DOMAIN_WALL}) +if(${QUDA_DIRAC_WILSON} OR ${QUDA_DIRAC_CLOVER} OR ${QUDA_DIRAC_TWISTED_MASS} OR ${QUDA_DIRAC_TWISTED_CLOVER} OR ${QUDA_DIRAC_DOMAIN_WALL}) cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp) target_link_libraries(dslash_test ${TEST_LIBS} ) From b6de8746264a7478c9b601ae18c936f7734aa7d3 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 9 Jun 2016 21:38:46 -0700 Subject: [PATCH 03/24] Fixed bugs in clover application: dslash_test now supports full clover testing (part 3 of #19). --- include/clover_field_order.h | 2 +- tests/CMakeLists.txt | 2 +- tests/clover_reference.cpp | 71 ++++++++++++++++++++++++++---------- tests/dslash_test.cpp | 6 +-- 4 files changed, 56 insertions(+), 25 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 73a3112398..91d964e482 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -144,7 +144,7 @@ namespace quda { complex tmp(a[parity][idx], a[parity][idx+1]); return tmp; } else { - // requesting upper triangular so return conjuate transpose + // requesting upper triangular so return conjugate transpose return conj(operator()(parity,x,s_col,s_row,c_col,c_row) ); } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a6d1a36c3e..04bea130e1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -27,7 +27,7 @@ endif() #define tests if(${QUDA_DIRAC_WILSON} OR ${QUDA_DIRAC_CLOVER} OR ${QUDA_DIRAC_TWISTED_MASS} OR ${QUDA_DIRAC_TWISTED_CLOVER} OR ${QUDA_DIRAC_DOMAIN_WALL}) - cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp) + cuda_add_executable(dslash_test dslash_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp blas_reference.cpp) target_link_libraries(dslash_test ${TEST_LIBS} ) cuda_add_executable(invert_test invert_test.cpp wilson_dslash_reference.cpp domain_wall_dslash_reference.cpp clover_reference.cpp blas_reference.cpp) diff --git a/tests/clover_reference.cpp b/tests/clover_reference.cpp index 30e3cb4699..881f383e4e 100644 --- a/tests/clover_reference.cpp +++ b/tests/clover_reference.cpp @@ -20,36 +20,36 @@ template void cloverReference(sFloat *out, cFloat *clover, sFloat *in, int parity) { int nSpin = 4; int nColor = 3; - int N = nSpin * nColor / 2; - int chiralBlock = (nSpin/2) * (nSpin/2) * nColor * nColor; + int N = nColor * nSpin / 2; + int chiralBlock = N + 2*(N-1)*N/2; - for (int i = 0; i < Vh; i++) { + for (int i=0; i *In = reinterpret_cast*>(&in[i*nSpin*nColor*2]); std::complex *Out = reinterpret_cast*>(&out[i*nSpin*nColor*2]); - for (int i=0; i *L = reinterpret_cast*>(&clover[((parity*Vh + i)*2 + chi)*chiralBlock+N]); + std::complex *L = reinterpret_cast*>(&D[N]); for (int s_col=0; s_col(out), static_cast(clover), static_cast(in), parity); + break; case QUDA_SINGLE_PRECISION: cloverReference(static_cast(out), static_cast(clover), static_cast(in), parity); + break; default: errorQuda("Unsupported precision %d", precision); } @@ -95,29 +97,47 @@ void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void switch(matpc_type) { case QUDA_MATPC_EVEN_EVEN: - wil_dslash(tmp, gauge, in, 1, dagger, precision, gauge_param); - apply_clover(out, clover_inv, tmp, 1, precision); - wil_dslash(tmp, gauge, out, 0, dagger, precision, gauge_param); - apply_clover(out, clover_inv, tmp, 0, precision); + if (!dagger) { + wil_dslash(tmp, gauge, in, 1, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 1, precision); + wil_dslash(tmp, gauge, out, 0, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 0, precision); + } else { + apply_clover(tmp, clover_inv, in, 0, precision); + wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param); + apply_clover(tmp, clover_inv, out, 1, precision); + wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param); + } xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + break; case QUDA_MATPC_EVEN_EVEN_ASYMMETRIC: wil_dslash(out, gauge, in, 1, dagger, precision, gauge_param); apply_clover(tmp, clover_inv, out, 1, precision); wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param); apply_clover(tmp, clover, in, 0, precision); xpay(tmp, kappa2, out, Vh*spinorSiteSize, precision); + break; case QUDA_MATPC_ODD_ODD: - wil_dslash(tmp, gauge, in, 0, dagger, precision, gauge_param); - apply_clover(out, clover_inv, tmp, 0, precision); - wil_dslash(tmp, gauge, out, 1, dagger, precision, gauge_param); - apply_clover(out, clover_inv, tmp, 1, precision); + if (!dagger) { + wil_dslash(tmp, gauge, in, 0, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 0, precision); + wil_dslash(tmp, gauge, out, 1, dagger, precision, gauge_param); + apply_clover(out, clover_inv, tmp, 1, precision); + } else { + apply_clover(tmp, clover_inv, in, 1, precision); + wil_dslash(out, gauge, tmp, 0, dagger, precision, gauge_param); + apply_clover(tmp, clover_inv, out, 0, precision); + wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param); + } xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + break; case QUDA_MATPC_ODD_ODD_ASYMMETRIC: wil_dslash(out, gauge, in, 0, dagger, precision, gauge_param); apply_clover(tmp, clover_inv, out, 0, precision); wil_dslash(out, gauge, tmp, 1, dagger, precision, gauge_param); apply_clover(tmp, clover, in, 1, precision); xpay(tmp, kappa2, out, Vh*spinorSiteSize, precision); + break; default: errorQuda("Unsupoorted matpc=%d", matpc_type); } @@ -129,14 +149,25 @@ void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + void *tmp = malloc(V*spinorSiteSize*precision); + void *inEven = in; void *inOdd = (char*)in + Vh*spinorSiteSize*precision; void *outEven = out; void *outOdd = (char*)out + Vh*spinorSiteSize*precision; + void *tmpEven = tmp; + void *tmpOdd = (char*)tmp + Vh*spinorSiteSize*precision; + + // Odd part + wil_dslash(outOdd, gauge, inEven, 1, dagger, precision, gauge_param); + apply_clover(tmpOdd, clover, inOdd, 1, precision); - clover_dslash(outOdd, gauge, clover, inEven, 1, dagger, precision, gauge_param); - clover_dslash(outEven, gauge, clover, inOdd, 0, dagger, precision, gauge_param); + // Even part + wil_dslash(outEven, gauge, inOdd, 0, dagger, precision, gauge_param); + apply_clover(tmpEven, clover, inEven, 0, precision); // lastly apply the kappa term - xpay(in, -kappa, out, V*spinorSiteSize, precision); + xpay(tmp, -kappa, out, V*spinorSiteSize, precision); + + free(tmp); } diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index d18f2a9f2a..8424818f08 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -321,10 +321,10 @@ void init(int argc, char **argv) { construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); } - spinor->Source(QUDA_RANDOM_SOURCE); + spinor->Source(QUDA_RANDOM_SOURCE, 0); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { - double norm = 0.0; // clover components are random numbers in the range (-norm, norm) + double norm = 1.0; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal if (test_type == 2 || test_type == 4) { @@ -647,7 +647,7 @@ void dslashRef() { } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { switch (test_type) { case 0: - clover_dslash(spinorTmp->V(), hostGauge, hostClover, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param); + clover_dslash(spinorRef->V(), hostGauge, hostCloverInv, spinor->V(), parity, dagger, inv_param.cpu_prec, gauge_param); break; case 1: clover_matpc(spinorRef->V(), hostGauge, hostClover, hostCloverInv, spinor->V(), inv_param.kappa, inv_param.matpc_type, From 6fd83bb115a47b3da9a5202f4028451277503856 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 10 Jun 2016 02:03:08 -0700 Subject: [PATCH 04/24] Fixed staggered reference dslash compiliation --- tests/CMakeLists.txt | 4 +-- tests/Makefile | 2 +- tests/staggered_dslash_reference.cpp | 39 +++++++++++++--------------- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 04bea130e1..bfd8d08ef8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -38,10 +38,10 @@ cuda_add_executable(deflation_test deflation_test.cpp wilson_dslash_reference.cp target_link_libraries(deflation_test ${TEST_LIBS}) if(${QUDA_DIRAC_STAGGERED}) - cuda_add_executable(staggered_dslash_test staggered_dslash_test.cpp staggered_dslash_reference.cpp) + cuda_add_executable(staggered_dslash_test staggered_dslash_test.cpp staggered_dslash_reference.cpp blas_reference.cpp) target_link_libraries(staggered_dslash_test ${TEST_LIBS}) - cuda_add_executable(staggered_invert_test staggered_invert_test.cpp staggered_dslash_reference.cpp blas_reference.cpp) + cuda_add_executable(staggered_invert_test staggered_invert_test.cpp staggered_dslash_reference.cpp blas_reference.cpp) target_link_libraries(staggered_invert_test ${TEST_LIBS}) endif() diff --git a/tests/Makefile b/tests/Makefile index 13479c38e1..f948fa13ba 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -71,7 +71,7 @@ multigrid_benchmark_test: multigrid_benchmark_test.o test_util.o misc.o $(QUDA) deflation_test: deflation_test.o test_util.o wilson_dslash_reference.o domain_wall_dslash_reference.o blas_reference.o misc.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) -staggered_dslash_test: staggered_dslash_test.o gtest-all.o test_util.o staggered_dslash_reference.o misc.o $(QUDA) +staggered_dslash_test: staggered_dslash_test.o gtest-all.o test_util.o staggered_dslash_reference.o misc.o blas_reference.o $(QUDA) $(CXX) $(LDFLAGS) $^ -o $@ $(LDFLAGS) staggered_invert_test: staggered_invert_test.o test_util.o staggered_dslash_reference.o misc.o blas_reference.o $(QUDA) diff --git a/tests/staggered_dslash_reference.cpp b/tests/staggered_dslash_reference.cpp index 6932e07c7b..ea845cb339 100644 --- a/tests/staggered_dslash_reference.cpp +++ b/tests/staggered_dslash_reference.cpp @@ -12,6 +12,7 @@ #include #include +#include extern void *memset(void *s, int c, size_t n); @@ -126,10 +127,7 @@ void Mat(sFloat *out, gFloat **fatlink, gFloat** longlink, sFloat *in, sFloat ka // full dslash operator dslashReference(outOdd, fatlink, longlink, inEven, 1, daggerBit); dslashReference(outEven, fatlink, longlink, inOdd, 0, daggerBit); - - // lastly apply the kappa term - xpay(in, -kappa, out, V*mySpinorSiteSize); -} + } void @@ -150,6 +148,9 @@ mat(void *out, void **fatlink, void** longlink, void *in, double kappa, int dagg Mat((float*)out, (float**)fatlink, (float**)longlink, (float*)in, (float)kappa, dagger_bit); } } + + // lastly apply the kappa term + xpay(in, -kappa, out, V*mySpinorSiteSize, sPrecision); } @@ -220,27 +221,19 @@ matdagmat(void *out, void **fatlink, void** longlink, void *in, double mass, int // Apply the even-odd preconditioned Dirac operator template -static void MatPC(sFloat *outEven, gFloat **fatlink, gFloat** longlink, sFloat *inEven, sFloat kappa, - int daggerBit, QudaMatPCType matpc_type) { +static void MatPC(sFloat *outEven, gFloat **fatlink, gFloat** longlink, sFloat *inEven, int dagger, QudaMatPCType matpc_type) { sFloat *tmp = (sFloat*)malloc(Vh*mySpinorSiteSize*sizeof(sFloat)); // full dslash operator if (matpc_type == QUDA_MATPC_EVEN_EVEN) { - dslashReference(tmp, fatlink, longlink, inEven, 1, daggerBit); - dslashReference(outEven, fatlink, longlink, tmp, 0, daggerBit); - - //dslashReference(outEven, fatlink, longlink, inEven, 1, daggerBit); + dslashReference(tmp, fatlink, longlink, inEven, 1, dagger); + dslashReference(outEven, fatlink, longlink, tmp, 0, dagger); } else { - dslashReference(tmp, fatlink, longlink, inEven, 0, daggerBit); - dslashReference(outEven, fatlink, longlink, tmp, 1, daggerBit); + dslashReference(tmp, fatlink, longlink, inEven, 0, dagger); + dslashReference(outEven, fatlink, longlink, tmp, 1, dagger); } - // lastly apply the kappa term - - sFloat kappa2 = -kappa*kappa; - xpay(inEven, kappa2, outEven, Vh*mySpinorSiteSize); - free(tmp); } @@ -252,18 +245,22 @@ staggered_matpc(void *outEven, void **fatlink, void**longlink, void *inEven, dou if (sPrecision == QUDA_DOUBLE_PRECISION) if (gPrecision == QUDA_DOUBLE_PRECISION) { - MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, (double)kappa, dagger_bit, matpc_type); + MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, dagger_bit, matpc_type); } else{ - MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, (double)kappa, dagger_bit, matpc_type); + MatPC((double*)outEven, (double**)fatlink, (double**)longlink, (double*)inEven, dagger_bit, matpc_type); } else { if (gPrecision == QUDA_DOUBLE_PRECISION){ - MatPC((float*)outEven, (double**)fatlink, (double**)longlink, (float*)inEven, (float)kappa, dagger_bit, matpc_type); + MatPC((float*)outEven, (double**)fatlink, (double**)longlink, (float*)inEven, dagger_bit, matpc_type); }else{ - MatPC((float*)outEven, (float**)fatlink, (float**)longlink, (float*)inEven, (float)kappa, dagger_bit, matpc_type); + MatPC((float*)outEven, (float**)fatlink, (float**)longlink, (float*)inEven, dagger_bit, matpc_type); } } + + // lastly apply the kappa term + double kappa2 = -kappa*kappa; + xpay(inEven, kappa2, outEven, Vh*mySpinorSiteSize, sPrecision); } #ifdef MULTI_GPU From da0babaac631951dcba1c25f8f7883d97969d586 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 10 Jun 2016 02:14:32 -0700 Subject: [PATCH 05/24] Set off-diagonal clover terms to be non-zero in clover invert test to make clover verification non trivial --- tests/invert_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index 4a1f9e3216..962af569cf 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -316,7 +316,7 @@ int main(int argc, char **argv) } if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { - double norm = 0.0; // clover components are random numbers in the range (-norm, norm) + double norm = 0.2; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); From a174af088c1329e4ed28f11ea3a3c1743d70a33b Mon Sep 17 00:00:00 2001 From: Alejandro Vaquero Date: Sun, 12 Jun 2016 12:11:14 +0200 Subject: [PATCH 06/24] Added host routines for twisted-clover Includes DYNAMIC_CLOVER support --- lib/dirac_twisted_clover.cpp | 20 ++-- lib/interface_quda.cpp | 48 +++++++-- tests/clover_reference.cpp | 168 ++++++++++++++++++++++++++++++++ tests/dslash_test.cpp | 88 +++++++++++++---- tests/wilson_dslash_reference.h | 10 ++ 5 files changed, 295 insertions(+), 39 deletions(-) diff --git a/lib/dirac_twisted_clover.cpp b/lib/dirac_twisted_clover.cpp index 6fbf3f9e54..4763697f2e 100644 --- a/lib/dirac_twisted_clover.cpp +++ b/lib/dirac_twisted_clover.cpp @@ -333,7 +333,7 @@ namespace quda { twistedCloverDslashCuda(&static_cast(out), *gauge, cs, cI, static_cast(tmp1), QUDA_EVEN_PARITY, dagger, &static_cast(in), - QUDA_DEG_DSLASH_CLOVER_TWIST_XPAY, a, kappa, 0.0, 0.0, commDim, profile); + QUDA_DEG_DSLASH_CLOVER_TWIST_XPAY, a, kappa2, 0.0, 0.0, commDim, profile); flops += (1320ll+96ll)*in.Volume(); } else if (matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { @@ -341,7 +341,7 @@ namespace quda { twistedCloverDslashCuda(&static_cast(out), *gauge, cs, cI, static_cast(tmp1), QUDA_ODD_PARITY, dagger, &static_cast(in), - QUDA_DEG_DSLASH_CLOVER_TWIST_XPAY, a, kappa, 0.0, 0.0, commDim, profile); + QUDA_DEG_DSLASH_CLOVER_TWIST_XPAY, a, kappa2, 0.0, 0.0, commDim, profile); flops += (1320ll+96ll)*in.Volume(); }else { // symmetric preconditioning errorQuda("Invalid matpcType"); @@ -386,27 +386,27 @@ namespace quda { if (matpcType == QUDA_MATPC_EVEN_EVEN) { // src = A_ee^-1 (b_e + k D_eo A_oo^-1 b_o) src = &(x.Odd()); - TwistCloverInv(*src, b.Odd(), 1); + TwistCloverInv(*src, b.Odd(), QUDA_ODD_PARITY); DiracWilson::DslashXpay(*tmp1, *src, QUDA_EVEN_PARITY, b.Even(), kappa); - TwistCloverInv(*src, *tmp1, 0); + TwistCloverInv(*src, *tmp1, QUDA_EVEN_PARITY); sol = &(x.Even()); } else if (matpcType == QUDA_MATPC_ODD_ODD) { // src = A_oo^-1 (b_o + k D_oe A_ee^-1 b_e) src = &(x.Even()); - TwistCloverInv(*src, b.Even(), 0); + TwistCloverInv(*src, b.Even(), QUDA_EVEN_PARITY); DiracWilson::DslashXpay(*tmp1, *src, QUDA_ODD_PARITY, b.Odd(), kappa); - TwistCloverInv(*src, *tmp1, 1); + TwistCloverInv(*src, *tmp1, QUDA_ODD_PARITY); sol = &(x.Odd()); } else if (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { // src = b_e + k D_eo A_oo^-1 b_o src = &(x.Odd()); - TwistCloverInv(*tmp1, b.Odd(), 1); // safe even when *tmp1 = b.odd + TwistCloverInv(*tmp1, b.Odd(), QUDA_ODD_PARITY); // safe even when *tmp1 = b.odd DiracWilson::DslashXpay(*src, *tmp1, QUDA_EVEN_PARITY, b.Even(), kappa); sol = &(x.Even()); } else if (matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { // src = b_o + k D_oe A_ee^-1 b_e src = &(x.Even()); - TwistCloverInv(*tmp1, b.Even(), 0); // safe even when *tmp1 = b.even + TwistCloverInv(*tmp1, b.Even(), QUDA_EVEN_PARITY); // safe even when *tmp1 = b.even DiracWilson::DslashXpay(*src, *tmp1, QUDA_ODD_PARITY, b.Odd(), kappa); sol = &(x.Odd()); } else { @@ -436,11 +436,11 @@ namespace quda { if (matpcType == QUDA_MATPC_EVEN_EVEN || matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { // x_o = A_oo^-1 (b_o + k D_oe x_e) DiracWilson::DslashXpay(*tmp1, x.Even(), QUDA_ODD_PARITY, b.Odd(), kappa); - TwistCloverInv(x.Odd(), *tmp1, 1); + TwistCloverInv(x.Odd(), *tmp1, QUDA_ODD_PARITY); } else if (matpcType == QUDA_MATPC_ODD_ODD || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { // x_e = A_ee^-1 (b_e + k D_eo x_o) DiracWilson::DslashXpay(*tmp1, x.Odd(), QUDA_EVEN_PARITY, b.Even(), kappa); - TwistCloverInv(x.Even(), *tmp1, 0); + TwistCloverInv(x.Even(), *tmp1, QUDA_EVEN_PARITY); } else { errorQuda("MatPCType %d not valid for DiracTwistedCloverPC", matpcType); } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index a7e50db23c..cdebe9b9aa 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -753,9 +753,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) CloverFieldParam clover_param; CloverField *in=NULL; -#ifndef DYNAMIC_CLOVER CloverField *inInv=NULL; -#endif if(!device_calc){ // create a param for the cpu clover field @@ -781,23 +779,22 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) cpuParam.inverse = false; cpuParam.cloverInv = NULL; cpuParam.clover = h_clover; + cpuParam.twisted = true; cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? static_cast(new cpuCloverField(cpuParam)) : static_cast(new cudaCloverField(cpuParam)); -#ifndef DYNAMIC_CLOVER cpuParam.cloverInv = h_clovinv; cpuParam.clover = NULL; cpuParam.twisted = true; - cpuParam.direct = true; - cpuParam.inverse = false; + cpuParam.direct = false; + cpuParam.inverse = true; cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; inInv = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? static_cast(new cpuCloverField(cpuParam)) : static_cast(new cudaCloverField(cpuParam)); -#endif } else { in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? static_cast(new cpuCloverField(cpuParam)) : @@ -814,6 +811,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) clover_param.siteSubset = QUDA_FULL_SITE_SUBSET; if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; clover_param.direct = true; clover_param.inverse = false; clover_param.twisted = true; @@ -823,7 +821,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) clover_param.inverse = true; clover_param.twisted = true; cloverInvPrecise = new cudaCloverField(clover_param); -// clover_param.twisted = false; #endif } else { cloverPrecise = new cudaCloverField(clover_param); @@ -958,6 +955,41 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) #endif } + // need to copy back the inverse field for twisted-clover tests + if (h_clovinv && !device_calc && (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)) { + // copy the inverted clover term into host application order on the device + clover_param.setPrecision(inv_param->clover_cpu_prec); + clover_param.direct = false; + clover_param.inverse = true; + clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; + clover_param.twisted = true; + + // this isn't really "epilogue" but this label suffices + profileClover.TPSTART(QUDA_PROFILE_EPILOGUE); +#ifndef DYNAMIC_CLOVER + clover_param.order = inv_param->clover_order; + cudaCloverField hack(clover_param); + hack.copy(*cloverInvPrecise); +#else + cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack + hackOfTheHack->copy(*cloverPrecise); + cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); + clover_param.order = inv_param->clover_order; + cudaCloverField hack(clover_param); + hack.copy(*hackOfTheHack); + delete hackOfTheHack; +#endif + profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE); + + // copy the field into the host application's clover field + profileClover.TPSTART(QUDA_PROFILE_D2H); + qudaMemcpy((char*)(inInv->V(true)), (char*)(hack.V(true)), + inInv->Bytes(), cudaMemcpyDeviceToHost); + profileClover.TPSTOP(QUDA_PROFILE_D2H); + + checkCudaError(); + } + // need to copy back the odd inverse field into the application clover field if (!h_clovinv && pc_solve && !device_calc) { // copy the inverted clover term into host application order on the device @@ -984,9 +1016,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if(!device_calc) { if (in) delete in; // delete object referencing input field -#ifndef DYNAMIC_CLOVER if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inInv) delete inInv; -#endif } popVerbosity(); diff --git a/tests/clover_reference.cpp b/tests/clover_reference.cpp index 881f383e4e..c108185ad1 100644 --- a/tests/clover_reference.cpp +++ b/tests/clover_reference.cpp @@ -171,3 +171,171 @@ void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, free(tmp); } + +void applyTwist(void *out, void *in, void *tmpH, double a, QudaPrecision precision) { + switch (precision) { + case QUDA_DOUBLE_PRECISION: + for(int i = 0; i < Vh; i++) + for(int s = 0; s < 4; s++) { + double a5 = ((s / 2) ? -1.0 : +1.0) * a; + for(int c = 0; c < 3; c++) { + ((double *) out)[i * 24 + s * 6 + c * 2 + 0] = ((double *) tmpH)[i * 24 + s * 6 + c * 2 + 0] - a5*((double *) in)[i * 24 + s * 6 + c * 2 + 1]; + ((double *) out)[i * 24 + s * 6 + c * 2 + 1] = ((double *) tmpH)[i * 24 + s * 6 + c * 2 + 1] + a5*((double *) in)[i * 24 + s * 6 + c * 2 + 0]; + } + } + break; + case QUDA_SINGLE_PRECISION: + for(int i = 0; i < Vh; i++) + for(int s = 0; s < 4; s++) { + float a5 = ((s / 2) ? -1.0 : +1.0) * a; + for(int c = 0; c < 3; c++) { + ((float *) out)[i * 24 + s * 6 + c * 2 + 0] = ((float *) tmpH)[i * 24 + s * 6 + c * 2 + 0] - a5*((float *) in)[i * 24 + s * 6 + c * 2 + 1]; + ((float *) out)[i * 24 + s * 6 + c * 2 + 1] = ((float *) tmpH)[i * 24 + s * 6 + c * 2 + 1] + a5*((float *) in)[i * 24 + s * 6 + c * 2 + 0]; + } + } + break; + default: + errorQuda("Unsupported precision %d", precision); + } +} + +// Apply (C + i*a*gamma_5)/(C^2 + a^2) +void twistCloverGamma5(void *out, void *in, void *clover, void *cInv, const int dagger, const double kappa, const double mu, + const QudaTwistFlavorType flavor, const int parity, QudaTwistGamma5Type twist, QudaPrecision precision) { + void *tmp1 = malloc(Vh*spinorSiteSize*precision); + void *tmp2 = malloc(Vh*spinorSiteSize*precision); + + double a = 0.0; + + if (twist == QUDA_TWIST_GAMMA5_DIRECT) { + a = 2.0 * kappa * mu * flavor; + + if (dagger) a *= -1.0; + + apply_clover(tmp1, clover, in, parity, precision); + applyTwist(out, in, tmp1, a, precision); + } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) { + a = -2.0 * kappa * mu * flavor; + + if (dagger) a *= -1.0; + + apply_clover(tmp1, clover, in, parity, precision); + applyTwist(tmp2, in, tmp1, a, precision); + apply_clover(out, cInv, tmp2, parity, precision); + } else { + printf("Twist type %d not defined\n", twist); + exit(0); + } + + free(tmp2); + free(tmp1); +} + +void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, + int parity, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam ¶m) { + void *tmp1 = malloc(Vh*spinorSiteSize*precision); + void *tmp2 = malloc(Vh*spinorSiteSize*precision); + + if (dagger) { + twistCloverGamma5(tmp1, in, clover, cInv, dagger, kappa, mu, flavor, 1-parity, QUDA_TWIST_GAMMA5_INVERSE, precision); + if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { + wil_dslash(tmp2, gauge, tmp1, parity, dagger, precision, param); + twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision); + } else { + wil_dslash(out, gauge, tmp1, parity, dagger, precision, param); + } + } else { + wil_dslash(tmp1, gauge, in, parity, dagger, precision, param); + twistCloverGamma5(out, tmp1, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision); + } + + free(tmp2); + free(tmp1); +} + +// Apply the full twisted-clover operator +void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, + QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + + void *tmp = malloc(V*spinorSiteSize*precision); + + void *inEven = in; + void *inOdd = (char*)in + Vh*spinorSiteSize*precision; + void *outEven = out; + void *outOdd = (char*)out + Vh*spinorSiteSize*precision; + void *tmpEven = tmp; + void *tmpOdd = (char*)tmp + Vh*spinorSiteSize*precision; + + // Odd part + wil_dslash(outOdd, gauge, inEven, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmpOdd, inOdd, clover, NULL, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_DIRECT, precision); + + // Even part + wil_dslash(outEven, gauge, inOdd, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmpEven, inEven, clover, NULL, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_DIRECT, precision); + + // lastly apply the kappa term + xpay(tmp, -kappa, out, V*spinorSiteSize, precision); + + free(tmp); +} + +// Apply the even-odd preconditioned Dirac operator +void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, + QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param) { + + double kappa2 = -kappa*kappa; + + void *tmp1 = malloc(Vh*spinorSiteSize*precision); + void *tmp2 = malloc(Vh*spinorSiteSize*precision); + + switch(matpc_type) { + case QUDA_MATPC_EVEN_EVEN: + if (!dagger) { + wil_dslash(out, gauge, in, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmp1, out, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(tmp2, gauge, tmp1, 0, dagger, precision, gauge_param); + twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + } else { + twistCloverGamma5(out, in, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(tmp1, gauge, out, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(out, gauge, tmp2, 0, dagger, precision, gauge_param); + } + xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_EVEN_EVEN_ASYMMETRIC: + wil_dslash(tmp1, gauge, in, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(out, gauge, tmp2, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, in, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_DIRECT, precision); + xpay(tmp2, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_ODD_ODD: + if (!dagger) { + wil_dslash(out, gauge, in, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmp1, out, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(tmp2, gauge, tmp1, 1, dagger, precision, gauge_param); + twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + } else { + twistCloverGamma5(out, in, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(tmp1, gauge, out, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(out, gauge, tmp2, 1, dagger, precision, gauge_param); + } + xpay(in, kappa2, out, Vh*spinorSiteSize, precision); + break; + case QUDA_MATPC_ODD_ODD_ASYMMETRIC: + wil_dslash(tmp1, gauge, in, 0, dagger, precision, gauge_param); + twistCloverGamma5(tmp2, tmp1, clover, cInv, dagger, kappa, mu, flavor, 0, QUDA_TWIST_GAMMA5_INVERSE, precision); + wil_dslash(out, gauge, tmp2, 1, dagger, precision, gauge_param); + twistCloverGamma5(tmp1, in, clover, cInv, dagger, kappa, mu, flavor, 1, QUDA_TWIST_GAMMA5_DIRECT, precision); + xpay(tmp1, kappa2, out, Vh*spinorSiteSize, precision); + break; + default: + errorQuda("Unsupported matpc=%d", matpc_type); + } + + free(tmp2); + free(tmp1); +} diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index 8424818f08..83c36344d7 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -229,7 +229,7 @@ void init(int argc, char **argv) { inv_param.dslash_type = dslash_type; - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { inv_param.clover_cpu_prec = cpu_prec; inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; @@ -242,8 +242,14 @@ void init(int argc, char **argv) { hostClover = NULL; hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); }*/ - } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { - + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + inv_param.clover_cpu_prec = cpu_prec; + inv_param.clover_cuda_prec = cuda_prec; + inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; + inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; + inv_param.clover_coeff = 1.5*inv_param.kappa; + hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); + hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); } setVerbosity(QUDA_VERBOSE); @@ -255,9 +261,6 @@ void init(int argc, char **argv) { csParam.nColor = 3; csParam.nSpin = 4; - if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - csParam.twistFlavor = inv_param.twist_flavor; - } csParam.nDim = 4; for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || @@ -274,7 +277,7 @@ void init(int argc, char **argv) { } //ndeg_tm - if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { + if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { csParam.twistFlavor = inv_param.twist_flavor; csParam.nDim = (inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) ? 4 : 5; csParam.x[4] = inv_param.Ls; @@ -332,6 +335,12 @@ void init(int argc, char **argv) { } else { construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec); } + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + // We are not so aggressive with the random clover term here because we need to invert it, so it must make some sense + double norm = 0.2; // clover components are random numbers in the range (-norm, norm) + double diag = 1.0; // constant added to the diagonal + + construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); } printfQuda("done.\n"); fflush(stdout); @@ -340,16 +349,11 @@ void init(int argc, char **argv) { printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { printfQuda("Sending clover field to GPU\n"); loadCloverQuda(hostClover, hostCloverInv, &inv_param); } - if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - printfQuda("Sending clover field to GPU\n"); - loadCloverQuda(NULL, NULL, &inv_param); - } - if (!transfer) { csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.pad = inv_param.sp_pad; @@ -444,7 +448,7 @@ void end() { delete spinorTmp; for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); - if((dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)){ + if((dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)) { if (hostClover != hostCloverInv && hostClover) free(hostClover); free(hostCloverInv); } @@ -548,12 +552,16 @@ double dslashCUDA(int niter) { } else { switch (test_type) { case 0: - if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH && (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD)) { + if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { if (transfer) { dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity); } else { - ((DiracTwistedCloverPC *) dirac)->TwistCloverInv(*tmp1, *cudaSpinor, (parity+1)%2); - dirac->Dslash(*cudaSpinorOut, *tmp1, parity); + if (dagger) { + ((DiracTwistedCloverPC *) dirac)->TwistCloverInv(*tmp1, *cudaSpinor, (parity+1)%2); + dirac->Dslash(*cudaSpinorOut, *tmp1, parity); + } else { + dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); + } } } else { if (transfer) { @@ -671,7 +679,7 @@ void dslashRef() { printfQuda("Test type not defined\n"); exit(-1); } - } else if ((dslash_type == QUDA_TWISTED_MASS_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)) { + } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { switch (test_type) { case 0: if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) @@ -723,7 +731,7 @@ void dslashRef() { } break; case 3: - if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS){ + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { tm_matpc(spinorTmp->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); tm_matpc(spinorRef->V(), hostGauge, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, @@ -747,7 +755,7 @@ void dslashRef() { } break; case 4: - if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS){ + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { tm_mat(spinorTmp->V(), hostGauge, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, dagger, inv_param.cpu_prec, gauge_param); tm_mat(spinorRef->V(), hostGauge, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, @@ -774,6 +782,46 @@ void dslashRef() { printfQuda("Test type not defined\n"); exit(-1); } + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + switch (test_type) { + case 0: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) + tmc_dslash(spinorRef->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); + else + errorQuda("Not supported\n"); + break; + case 1: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) + tmc_matpc(spinorRef->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); + else + errorQuda("Not supported\n"); + break; + case 2: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) + tmc_mat(spinorRef->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, dagger, inv_param.cpu_prec, gauge_param); + else + errorQuda("Not supported\n"); + break; + case 3: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { + tmc_matpc(spinorTmp->V(), hostGauge, spinor->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + inv_param.matpc_type, dagger, inv_param.cpu_prec, gauge_param); + tmc_matpc(spinorRef->V(), hostGauge, spinorTmp->V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); + } else + errorQuda("Not supported\n"); + break; + case 4: + if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) { + tmc_mat(spinorTmp->V(), hostGauge, hostClover, spinor->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, dagger, inv_param.cpu_prec, gauge_param); + tmc_mat(spinorRef->V(), hostGauge, hostClover, spinorTmp->V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, not_dagger, inv_param.cpu_prec, gauge_param); + } else + errorQuda("Not supported\n"); + break; + default: + printfQuda("Test type not defined\n"); + exit(-1); + } } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ){ switch (test_type) { case 0: diff --git a/tests/wilson_dslash_reference.h b/tests/wilson_dslash_reference.h index 4b270df7a8..0dc929a299 100644 --- a/tests/wilson_dslash_reference.h +++ b/tests/wilson_dslash_reference.h @@ -28,6 +28,16 @@ extern "C" { QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam ¶m); + void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, + double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type, + int daggerBit, QudaPrecision sprecision, QudaGaugeParam ¶m); + + void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, + QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param); + + void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, + QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param); + void tm_ndeg_dslash(void *res1, void *res2, void **gaugeFull, void *spinorField1, void *spinorField2, double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param); From 9b1795eda6dc06fa4a520a2508743db9c92c0f60 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 15 Jun 2016 00:25:59 -0700 Subject: [PATCH 07/24] Fixed bug in tuning of clover inversion if source destination are the same: source must be preserved and restored since tuning destroys the input. --- include/clover_field_order.h | 28 ++++++++++++++++++++++++++-- lib/clover_invert.cu | 4 ++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 91d964e482..a3c2c774bf 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -259,9 +259,12 @@ namespace quda { const bool twisted; const Float mu2; - + + size_t bytes; + void *backup_h; //! host memory for backing up the field when tuning + FloatNOrder(const CloverField &clover, bool inverse, Float *clover_=0, float *norm_=0) : volumeCB(clover.VolumeCB()), stride(clover.Stride()), - twisted(clover.Twisted()), mu2(clover.Mu2()) { + twisted(clover.Twisted()), mu2(clover.Mu2()), bytes(clover.Bytes()), backup_h(nullptr) { this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse)); this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2); this->norm[0] = norm_ ? norm_ : (float*)(clover.Norm(inverse)); @@ -313,6 +316,26 @@ namespace quda { } } + /** + @brief Backup the field to the host when tuning + */ + void save() { + if (backup_h) errorQuda("Already allocated host backup"); + backup_h = safe_malloc(bytes); + cudaMemcpy(backup_h, clover[0], bytes, cudaMemcpyDeviceToHost); + checkCudaError(); + } + + /** + @brief Restore the field from the host after tuning + */ + void load() { + cudaMemcpy(clover[0], backup_h, bytes, cudaMemcpyHostToDevice); + host_free(backup_h); + backup_h = nullptr; + checkCudaError(); + } + size_t Bytes() const { size_t bytes = length*sizeof(Float); if (sizeof(Float)==sizeof(short)) bytes += 2*sizeof(float); @@ -320,6 +343,7 @@ namespace quda { } }; + /** QDP ordering for clover fields */ diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu index 2764a84e2f..77bd0fba2b 100644 --- a/lib/clover_invert.cu +++ b/lib/clover_invert.cu @@ -254,6 +254,10 @@ namespace quda { long long flops() const { return 0; } long long bytes() const { return 2*arg.clover.volumeCB*(arg.inverse.Bytes() + arg.clover.Bytes()); } + + void preTune() { if (arg.clover.clover[0] == arg.inverse.clover[0]) arg.inverse.save(); } + void postTune() { if (arg.clover.clover[0] == arg.inverse.clover[0]) arg.inverse.load(); } + }; template From 5f028db11bc5111032dc9ab107dc36b681728f60 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 15 Jun 2016 14:48:08 -0700 Subject: [PATCH 08/24] clover::FloatNOrder now uses vectorized load/store for improved performance of all algorithms that use this (clover inversion sees a 1.5x speedup). Added missing support for clover norm field save/restore in tuning. --- include/clover_field_order.h | 84 ++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 28 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index a3c2c774bf..b890d475e4 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -252,8 +252,12 @@ namespace quda { template struct FloatNOrder { typedef typename mapper::type RegType; - Float *clover[2]; - float *norm[2]; + typedef typename VectorType::type Vector; + + Float *clover; + float *norm; + size_t offset; + size_t norm_offset; const int volumeCB; const int stride; @@ -261,30 +265,38 @@ namespace quda { const Float mu2; size_t bytes; + size_t norm_bytes; void *backup_h; //! host memory for backing up the field when tuning - - FloatNOrder(const CloverField &clover, bool inverse, Float *clover_=0, float *norm_=0) : volumeCB(clover.VolumeCB()), stride(clover.Stride()), - twisted(clover.Twisted()), mu2(clover.Mu2()), bytes(clover.Bytes()), backup_h(nullptr) { - this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse)); - this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2); - this->norm[0] = norm_ ? norm_ : (float*)(clover.Norm(inverse)); - this->norm[1] = (float*)((char*)this->norm[0] + clover.NormBytes()/2); - } + void *backup_norm_h; //! host memory for backing up norm when tuning + + FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, float *norm_=0) : + offset(clover.Bytes()/(2*sizeof(Float))), norm_offset(clover.NormBytes()/(2*sizeof(float))), + volumeCB(clover.VolumeCB()), stride(clover.Stride()), + twisted(clover.Twisted()), mu2(clover.Mu2()), bytes(clover.Bytes()), + norm_bytes(clover.NormBytes()), backup_h(nullptr), backup_norm_h(nullptr) + { + this->clover = clover_ ? clover_ : (Float*)(clover.V(is_inverse)); + this->norm = norm_ ? norm_ : (float*)(clover.Norm(is_inverse)); + } bool Twisted() const {return twisted;} Float Mu2() const {return mu2;} __device__ __host__ inline void load(RegType v[length], int x, int parity) const { - const int M=length/(N*2); + const int M = length/(N*2); +#pragma unroll for (int chirality=0; chirality<2; chirality++) { +#pragma unroll for (int i=0; i(clover + parity*offset, x + stride*(chirality*M+i)); + // second do scalar copy converting into register type + for (int j=0; j(&vecTmp)[j]); } + + if (sizeof(Float)==sizeof(short)) +#pragma unroll + for (int i=0; i scale[chi] ? fabs(v[chi*M+i]) : scale[chi]; - norm[parity][chi*stride + x] = scale[chi]; + norm[parity*norm_offset + chi*stride + x] = scale[chi]; } } const int M=length/(N*2); for (int chirality=0; chirality<2; chirality++) { + +#pragma unroll for (int i=0; i(&vecTmp)[j], v[(chirality*M+i)*N+j] / scale[chirality]); + else +#pragma unroll + for (int j=0; j(&vecTmp)[j], v[(chirality*M+i)*N+j]); + + // second do vectorized copy into memory + reinterpret_cast(clover + parity*offset)[x + stride*(chirality*M+i)] = vecTmp; } } } @@ -322,7 +341,11 @@ namespace quda { void save() { if (backup_h) errorQuda("Already allocated host backup"); backup_h = safe_malloc(bytes); - cudaMemcpy(backup_h, clover[0], bytes, cudaMemcpyDeviceToHost); + cudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost); + if (norm_bytes) { + backup_norm_h = safe_malloc(norm_bytes); + cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost); + } checkCudaError(); } @@ -330,9 +353,14 @@ namespace quda { @brief Restore the field from the host after tuning */ void load() { - cudaMemcpy(clover[0], backup_h, bytes, cudaMemcpyHostToDevice); + cudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice); host_free(backup_h); backup_h = nullptr; + if (norm_bytes) { + cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice); + host_free(backup_norm_h); + backup_norm_h = nullptr; + } checkCudaError(); } From 2a6a94110c941eee1bf8adb3d1688d8fa67f94ec Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 15 Jun 2016 14:49:17 -0700 Subject: [PATCH 09/24] Updated clover inversion preTune/postTune for direct pointer access introduced in clover::FloatNOrder in last commit --- lib/clover_invert.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu index 77bd0fba2b..07be38ae98 100644 --- a/lib/clover_invert.cu +++ b/lib/clover_invert.cu @@ -255,8 +255,8 @@ namespace quda { long long flops() const { return 0; } long long bytes() const { return 2*arg.clover.volumeCB*(arg.inverse.Bytes() + arg.clover.Bytes()); } - void preTune() { if (arg.clover.clover[0] == arg.inverse.clover[0]) arg.inverse.save(); } - void postTune() { if (arg.clover.clover[0] == arg.inverse.clover[0]) arg.inverse.load(); } + void preTune() { if (arg.clover.clover == arg.inverse.clover) arg.inverse.save(); } + void postTune() { if (arg.clover.clover == arg.inverse.clover) arg.inverse.load(); } }; From ae3e7ac825a980b9eff9fd5e74c51cfca26266b0 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 15 Jun 2016 16:12:27 -0700 Subject: [PATCH 10/24] Set clover norm to 0.1, as 0.2 leads to poorly conditioned clover matrix that results in signficant errors in cholesky decomposition leading to failure in twisted-clover test. --- tests/dslash_test.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index 83c36344d7..eacefcf315 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -75,6 +75,8 @@ extern char latfile[]; extern bool kernel_pack_t; +QudaVerbosity verbosity = QUDA_VERBOSE; + void init(int argc, char **argv) { cuda_prec = prec; @@ -252,8 +254,6 @@ void init(int argc, char **argv) { hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); } - setVerbosity(QUDA_VERBOSE); - // construct input fields for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); @@ -337,7 +337,7 @@ void init(int argc, char **argv) { } } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { // We are not so aggressive with the random clover term here because we need to invert it, so it must make some sense - double norm = 0.2; // clover components are random numbers in the range (-norm, norm) + double norm = 0.1; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); @@ -346,6 +346,15 @@ void init(int argc, char **argv) { initQuda(device); + if (tune) { + setTuning(QUDA_TUNE_YES); + printfQuda("Tuning...\n"); + } + + // set verbosity prior to loadGaugeQuda + setVerbosity(verbosity); + inv_param.verbosity = verbosity; + printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); @@ -981,8 +990,6 @@ int main(int argc, char **argv) for (int i=0; i Date: Wed, 15 Jun 2016 19:55:06 -0700 Subject: [PATCH 11/24] Clover inversion now uses grid-wide reduction rather than atomics to compute trace log: this makes the result reproducible after tuning has completed. Trace-log computation and twist parameters are now templated to improve code generation. --- lib/clover_invert.cu | 121 +++++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 51 deletions(-) diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu index 07be38ae98..1bfca4dd79 100644 --- a/lib/clover_invert.cu +++ b/lib/clover_invert.cu @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace quda { @@ -13,27 +13,24 @@ namespace quda { #ifdef GPU_CLOVER_DIRAC template - struct CloverInvertArg { + struct CloverInvertArg : public ReduceArg { const Clover clover; Clover inverse; bool computeTraceLog; - double * const trlogA_h; - double *trlogA_d; //extra attributes for twisted mass clover bool twist; double mu2; - CloverInvertArg(Clover &inverse, const Clover &clover, bool computeTraceLog=0, double* const trlogA=0) : - inverse(inverse), clover(clover), computeTraceLog(computeTraceLog), trlogA_h(trlogA), twist(clover.Twisted()), mu2(clover.Mu2()){ - cudaHostGetDevicePointer(&trlogA_d, trlogA_h, 0); // set the matching device pointer - } + CloverInvertArg(Clover &inverse, const Clover &clover, bool computeTraceLog=0) : + ReduceArg(), inverse(inverse), clover(clover), computeTraceLog(computeTraceLog), + twist(clover.Twisted()), mu2(clover.Mu2()) { } }; /** Use a Cholesky decomposition to invert the clover matrix Here we use an inplace inversion which hopefully reduces register pressure */ - template - __device__ __host__ double cloverInvertCompute(CloverInvertArg arg, int x, int parity) { + template + __device__ __host__ inline double cloverInvertCompute(CloverInvertArg arg, int x, int parity) { Float A[72]; double trlogA = 0.0; @@ -50,14 +47,14 @@ namespace quda { // hack into the right order as MILC just to copy algorithm directly // FIXME use native ordering in the Cholseky // factor of two is inherent to QUDA clover storage - for (int i=0; i<6; i++) diag[i] = 2.0*A[ch*36+i]; + constexpr Float two = static_cast(2.0); + for (int i=0; i<6; i++) diag[i] = two*A[ch*36+i]; const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14}; - for (int i=0; i<15; i++) tri[idtab[i]] = complex(2.0*A[ch*36+6+2*i], 2.0*A[ch*36+6+2*i+1]); + for (int i=0; i<15; i++) tri[idtab[i]] = complex(two*A[ch*36+6+2*i], two*A[ch*36+6+2*i+1]); -//Compute (T^2 + mu2) first, then invert (not optimized!): - if(arg.twist) - { + //Compute (T^2 + mu2) first, then invert (not optimized!): + if (twist) { complex aux[15];//hmmm, better to reuse A-regs... //another solution just to define (but compiler may not be happy with this, swapping everything in //the global buffer): @@ -103,10 +100,10 @@ namespace quda { diag[4] = (Float)arg.mu2+diag[4]*diag[4]+norm(tri[ 6])+norm(tri[ 7])+norm(tri[ 8])+norm(tri[ 9])+norm(tri[14]); diag[5] = (Float)arg.mu2+diag[5]*diag[5]+norm(tri[10])+norm(tri[11])+norm(tri[12])+norm(tri[13])+norm(tri[14]); - //update off-diagonal elements: + //update off-diagonal elements: for(int i = 0; i < 15; i++) tri[i] = aux[i]; } -// + for (int j=0; j<6; j++) { diag[j] = sqrt(diag[j]); tmp[j] = 1.0 / diag[j]; @@ -128,7 +125,7 @@ namespace quda { } /* Accumulate trlogA */ - for (int j=0;j<6;j++) trlogA += (double)2.0*log((double)(diag[j])); + if (computeTrLog) for (int j=0;j<6;j++) trlogA += 2.0*log((double)(diag[j])); /* Now use forward and backward substitution to construct inverse */ complex v1[6]; @@ -165,9 +162,10 @@ namespace quda { } } - for (int i=0; i<6; i++) A[ch*36+i] = 0.5 * diag[i]; + constexpr Float half = static_cast(0.5); + for (int i=0; i<6; i++) A[ch*36+i] = half * diag[i]; for (int i=0; i<15; i++) { - A[ch*36+6+2*i] = 0.5*tri[idtab[i]].real(); A[ch*36+6+2*i+1] = 0.5*tri[idtab[i]].imag(); + A[ch*36+6+2*i] = half*tri[idtab[i]].real(); A[ch*36+6+2*i+1] = half*tri[idtab[i]].imag(); } } @@ -177,67 +175,82 @@ namespace quda { return trlogA; } - template + template void cloverInvert(CloverInvertArg arg) { for (int parity=0; parity<2; parity++) { for (int x=0; x(arg, x, parity); - if (arg.computeTraceLog) arg.trlogA_h[parity] += trlogA; + double trlogA = cloverInvertCompute(arg, x, parity); + if (computeTrLog) { + if (parity) arg.result_h[0].y += trlogA; + else arg.result_h[0].x += trlogA; + } } } } - template + template + __launch_bounds__(2*blockSize) __global__ void cloverInvertKernel(CloverInvertArg arg) { int idx = blockIdx.x*blockDim.x + threadIdx.x; - //if (idx >= arg.clover.volumeCB) return; - int parity = blockIdx.y; - double trlogA = 0.0; - if (idx < arg.clover.volumeCB) trlogA = cloverInvertCompute(arg, idx, parity); - - if (arg.computeTraceLog) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - double aggregate = BlockReduce(temp_storage).Sum(trlogA); - if (threadIdx.x == 0) atomicAdd(arg.trlogA_d+parity, aggregate); - } + int parity = threadIdx.y; + double trlogA_parity = 0.0; + if (idx < arg.clover.volumeCB) + trlogA_parity = cloverInvertCompute(arg, idx, parity); + double2 trlogA = parity ? make_double2(0.0,trlogA_parity) : make_double2(trlogA_parity, 0.0); + if (computeTrLog) reduce2d(arg, trlogA); } template - class CloverInvert : Tunable { + class CloverInvert : TunableLocalParity { CloverInvertArg arg; const CloverField &meta; // used for meta data only const QudaFieldLocation location; private: - unsigned int sharedBytesPerThread() const { return 0; } - unsigned int sharedBytesPerBlock(const TuneParam ¶m) const { return 0 ;} - bool tuneSharedBytes() const { return false; } // Don't tune the shared memory - bool tuneGridDim() const { return false; } // Don't tune the grid dimensions. unsigned int minThreads() const { return arg.clover.volumeCB; } public: CloverInvert(CloverInvertArg &arg, const CloverField &meta, QudaFieldLocation location) : arg(arg), meta(meta), location(location) { - writeAuxString("stride=%d,prec=%lu",arg.clover.stride,sizeof(Float)); + writeAuxString("stride=%d,prec=%lu,trlog=%s,twist=%s", arg.clover.stride, sizeof(Float), + arg.computeTraceLog ? "true" : "false", arg.twist ? "true" : "false"); } virtual ~CloverInvert() { ; } void apply(const cudaStream_t &stream) { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - arg.trlogA_h[0] = 0.0; arg.trlogA_h[1] = 0.0; + arg.result_h[0] = make_double2(0.,0.); if (location == QUDA_CUDA_FIELD_LOCATION) { - tp.grid.y = 2; // for parity - LAUNCH_KERNEL(cloverInvertKernel, tp, stream, arg, Float, Clover); + if (arg.computeTraceLog) { + if (arg.twist) { + LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, true, true); + } else { + LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, true, false); + } + } else { + if (arg.twist) { + LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, false, true); + } else { + LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Clover, false, false); + } + } } else { - cloverInvert<1, Float, Clover>(arg); - } - if (arg.computeTraceLog) { - cudaDeviceSynchronize(); - reduceDoubleArray(arg.trlogA_h, 2); + if (arg.computeTraceLog) { + if (arg.twist) { + cloverInvert<1, Float, Clover, true, true>(arg); + } else { + cloverInvert<1, Float, Clover, true, false>(arg); + } + } else { + if (arg.twist) { + cloverInvert<1, Float, Clover, false, true>(arg); + } else { + cloverInvert<1, Float, Clover, false, false>(arg); + } + } } } @@ -263,10 +276,16 @@ namespace quda { template void cloverInvert(Clover inverse, const Clover clover, bool computeTraceLog, double* const trlog, const CloverField &meta, QudaFieldLocation location) { - CloverInvertArg arg(inverse, clover, computeTraceLog, trlog); + CloverInvertArg arg(inverse, clover, computeTraceLog); CloverInvert invert(arg, meta, location); invert.apply(0); - cudaDeviceSynchronize(); + + if (arg.computeTraceLog) { + cudaDeviceSynchronize(); + comm_allreduce_array((double*)arg.result_h, 2); + trlog[0] = arg.result_h[0].x; + trlog[1] = arg.result_h[0].y; + } } template From cab5e731e52667ca4ef8099c17a20bf57f77ef6d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 15 Jun 2016 22:41:20 -0700 Subject: [PATCH 12/24] Display last error when autotuning fails --- lib/tune.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tune.cpp b/lib/tune.cpp index 7889039fc4..268810ba2c 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -599,7 +599,7 @@ namespace quda { } if (best_time == FLT_MAX) { - errorQuda("Auto-tuning failed for %s with %s at vol=%s", key.name, key.aux, key.volume); + errorQuda("Auto-tuning failed for %s with %s at vol=%s, error %s", key.name, key.aux, key.volume, cudaGetErrorString(error)); } if (verbosity >= QUDA_VERBOSE) { printfQuda("Tuned %s giving %s for %s with %s\n", tunable.paramString(best_param).c_str(), From a671b9cbb6c62b25e4c3f4f00ba4da1cd222f3d7 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 15 Jun 2016 22:54:38 -0700 Subject: [PATCH 13/24] Fixed warning in last commit --- lib/tune.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tune.cpp b/lib/tune.cpp index 268810ba2c..16e8906de9 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -536,7 +536,7 @@ namespace quda { } else if (!tuning) { TuneParam best_param; - cudaError_t error; + cudaError_t error = cudaSuccess; cudaEvent_t start, end; float elapsed_time, best_time; time_t now; From 9da9306ce5e1f945aac8b80b20ce76f3608591d5 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 17 Jun 2016 00:10:14 -0700 Subject: [PATCH 14/24] Merged separate clover and clover inverse fields for twisted-clover fermions. Fixed bug affecting dyanmic-clover caused by dereferencing a null pointer. --- include/dirac_quda.h | 4 +- include/multigrid.h | 3 +- lib/coarse_op.cu | 66 ++++++------- lib/dirac_clover.cpp | 6 +- lib/dirac_twisted_clover.cpp | 60 +++--------- lib/dirac_twisted_mass.cpp | 4 +- lib/dirac_wilson.cpp | 2 +- lib/interface_quda.cpp | 174 +++++++---------------------------- 8 files changed, 82 insertions(+), 237 deletions(-) diff --git a/include/dirac_quda.h b/include/dirac_quda.h index c62502b8ae..9488f4ca96 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -33,7 +33,6 @@ namespace quda { cudaGaugeField *fatGauge; // used by staggered only cudaGaugeField *longGauge; // used by staggered only cudaCloverField *clover; - cudaCloverField *cloverInv; double mu; // used by twisted mass only double epsilon; //2nd tm parameter (used by twisted mass only) @@ -49,7 +48,7 @@ namespace quda { DiracParam() : type(QUDA_INVALID_DIRAC), kappa(0.0), m5(0.0), matpcType(QUDA_MATPC_INVALID), - dagger(QUDA_DAG_INVALID), gauge(0), clover(0), cloverInv(0), mu(0.0), epsilon(0.0), + dagger(QUDA_DAG_INVALID), gauge(0), clover(0), mu(0.0), epsilon(0.0), tmp1(0), tmp2(0) { @@ -537,7 +536,6 @@ namespace quda { double mu; double epsilon; cudaCloverField &clover; - cudaCloverField &cloverInv; void checkParitySpinor(const ColorSpinorField &, const ColorSpinorField &) const; void twistedCloverApply(ColorSpinorField &out, const ColorSpinorField &in, const QudaTwistGamma5Type twistType, const int parity) const; diff --git a/include/multigrid.h b/include/multigrid.h index 682442e6e8..a8e1eb95d6 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -329,7 +329,6 @@ namespace quda { @param T[in] Transfer operator that defines the coarse space @param gauge[in] Gauge field from fine grid @param clover[in] Clover field on fine grid (optional) - @param cloverInv[in] Inverse Clover field on fine grid (optional, only for twisted-clover) @param kappa[in] Kappa parameter @param mu[in] Mu parameter (set to non-zero for twisted-mass/twisted-clover) @param matpc[in] The type of even-odd preconditioned fine-grid @@ -338,7 +337,7 @@ namespace quda { even-odd preconditioned and we coarsen the full operator. */ void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T, - const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv, + const cudaGaugeField &gauge, const cudaCloverField *clover, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc); /** diff --git a/lib/coarse_op.cu b/lib/coarse_op.cu index 1dc9c3f0ed..5a0e28b24d 100644 --- a/lib/coarse_op.cu +++ b/lib/coarse_op.cu @@ -15,7 +15,7 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { typedef typename colorspinor::FieldOrderCB F; typedef typename gauge::FieldOrder gFine; @@ -34,7 +34,7 @@ namespace quda { gCoarse xAccessor(const_cast(X)); gCoarse xInvAccessor(const_cast(Xinv)); cFine cAccessor(const_cast(c), false); - cFine cInvAccessor(const_cast(cI), true); + cFine cInvAccessor(const_cast(c), true); calculateY (yAccessor, xAccessor, xInvAccessor, uvAccessor, avAccessor, vAccessor, gAccessor, cAccessor, cInvAccessor, Y, X, Xinv, Yhat, av, v, kappa, mu, dirac, matpc); @@ -44,28 +44,28 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (T.Vectors().Nspin()/T.Spin_bs() != 2) errorQuda("Unsupported number of coarse spins %d\n",T.Vectors().Nspin()/T.Spin_bs()); const int coarseSpin = 2; const int coarseColor = Y.Ncolor() / coarseSpin; if (coarseColor == 2) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 4) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 8) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 12) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 16) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 20) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 24) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else if (coarseColor == 32) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported number of coarse dof %d\n", Y.Ncolor()); } @@ -74,9 +74,9 @@ namespace quda { // template on fine spin template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (uv.Nspin() == 4) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported number of spins %d\n", uv.Nspin()); } @@ -85,9 +85,9 @@ namespace quda { // template on fine colors template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (g.Ncolor() == 3) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported number of colors %d\n", g.Ncolor()); } @@ -95,10 +95,10 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { //If c == NULL, then this is standard Wilson. csOrder is dummy and will not matter if (c.Order() == QUDA_PACKED_CLOVER_ORDER) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported field order %d\n", c.Order()); } @@ -106,9 +106,9 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (g.FieldOrder() == QUDA_QDP_GAUGE_ORDER) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported field order %d\n", g.FieldOrder()); } @@ -116,9 +116,9 @@ namespace quda { template void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (T.Vectors().FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported field order %d\n", T.Vectors().FieldOrder()); } @@ -126,7 +126,7 @@ namespace quda { //Does the heavy lifting of creating the coarse color matrices Y void calculateY(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, ColorSpinorField &uv, ColorSpinorField &av, const Transfer &T, - const GaugeField &g, const CloverField &c, const CloverField &cI, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { + const GaugeField &g, const CloverField &c, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { if (X.Precision() != Y.Precision() || Y.Precision() != uv.Precision() || Y.Precision() != T.Vectors().Precision() || Y.Precision() != g.Precision()) errorQuda("Unsupported precision mix"); @@ -135,12 +135,12 @@ namespace quda { if (Y.Precision() == QUDA_DOUBLE_PRECISION) { #ifdef GPU_MULTIGRID_DOUBLE - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); #else errorQuda("Double precision multigrid has not been enabled"); #endif } else if (Y.Precision() == QUDA_SINGLE_PRECISION) { - calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, cI, kappa, mu, dirac, matpc); + calculateY(Y, X, Xinv, Yhat, uv, av, T, g, c, kappa, mu, dirac, matpc); } else { errorQuda("Unsupported precision %d\n", Y.Precision()); } @@ -150,7 +150,7 @@ namespace quda { //Calculates the coarse color matrix and puts the result in Y. //N.B. Assumes Y, X have been allocated. void CoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T, - const cudaGaugeField &gauge, const cudaCloverField *clover, const cudaCloverField *cloverInv, + const cudaGaugeField &gauge, const cudaCloverField *clover, double kappa, double mu, QudaDiracType dirac, QudaMatPCType matpc) { QudaPrecision precision = Y.Precision(); //First make a cpu gauge field from the cuda gauge field @@ -203,19 +203,9 @@ namespace quda { cf_param.create = QUDA_NULL_FIELD_CREATE; cf_param.siteSubset = QUDA_FULL_SITE_SUBSET; - if (cloverInv && (dirac == QUDA_TWISTED_CLOVERPC_DIRAC)) { - cf_param.direct = false; - cpuCloverField cI(cf_param); - cloverInv->saveCPUField(cI); - cf_param.direct = true; - cpuCloverField c(cf_param); - clover->saveCPUField(c); - calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, cI, kappa, mu, dirac, matpc); - } else { - cpuCloverField c(cf_param); - if (clover) clover->saveCPUField(c); - calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, c, kappa, mu, dirac, matpc); - } + cpuCloverField c(cf_param); + if (clover) clover->saveCPUField(c); + calculateY(Y, X, Xinv, Yhat, *uv, *av, T, g, c, kappa, mu, dirac, matpc); if (&T.Vectors() != av) delete av; delete uv; diff --git a/lib/dirac_clover.cpp b/lib/dirac_clover.cpp index 76f4a45d04..6d12fa2bc8 100644 --- a/lib/dirac_clover.cpp +++ b/lib/dirac_clover.cpp @@ -158,8 +158,7 @@ namespace quda { } void DiracClover::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { - cudaCloverField *cInv = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, cInv, kappa, 0.0, QUDA_CLOVER_DIRAC, QUDA_MATPC_INVALID); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, 0.0, QUDA_CLOVER_DIRAC, QUDA_MATPC_INVALID); } DiracCloverPC::DiracCloverPC(const DiracParam ¶m) : @@ -379,8 +378,7 @@ namespace quda { } void DiracCloverPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { - cudaCloverField *cInv = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, cInv, kappa, 0.0, QUDA_CLOVERPC_DIRAC, matpcType); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, 0.0, QUDA_CLOVERPC_DIRAC, matpcType); } } // namespace quda diff --git a/lib/dirac_twisted_clover.cpp b/lib/dirac_twisted_clover.cpp index 4763697f2e..4f4bfc9477 100644 --- a/lib/dirac_twisted_clover.cpp +++ b/lib/dirac_twisted_clover.cpp @@ -14,14 +14,14 @@ namespace quda { } DiracTwistedClover::DiracTwistedClover(const DiracParam ¶m, const int nDim) - : DiracWilson(param, nDim), mu(param.mu), epsilon(param.epsilon), clover(*(param.clover)), cloverInv(*(param.cloverInv)) + : DiracWilson(param, nDim), mu(param.mu), epsilon(param.epsilon), clover(*(param.clover)) { twistedclover::initConstants(*param.gauge,profile); dslash_aux::initConstants(*param.gauge,profile); } DiracTwistedClover::DiracTwistedClover(const DiracTwistedClover &dirac) - : DiracWilson(dirac), mu(dirac.mu), epsilon(dirac.epsilon), clover(dirac.clover), cloverInv(dirac.cloverInv) + : DiracWilson(dirac), mu(dirac.mu), epsilon(dirac.epsilon), clover(dirac.clover) { twistedclover::initConstants(*dirac.gauge,profile); dslash_aux::initConstants(*dirac.gauge,profile); @@ -35,7 +35,6 @@ namespace quda { { DiracWilson::operator=(dirac); clover = dirac.clover; - cloverInv = dirac.cloverInv; } return *this; @@ -61,12 +60,9 @@ namespace quda { if (in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS) { - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); + double flavor_mu = in.TwistFlavor() * mu; twistCloverGamma5Cuda(&static_cast(out), &static_cast(in), @@ -78,9 +74,7 @@ namespace quda { flops += 552ll*in.Volume(); delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif } else errorQuda("DiracTwistedClover::twistedCloverApply method for flavor doublet is not implemented..\n"); @@ -113,12 +107,8 @@ namespace quda { twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){ double a = 2.0 * kappa * in.TwistFlavor() * mu;//for direct twist (must be daggered separately) @@ -136,9 +126,7 @@ namespace quda { } deleteTmp(&tmp, reset); delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif } void DiracTwistedClover::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const @@ -172,7 +160,7 @@ namespace quda { void DiracTwistedClover::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { double a = 2.0 * kappa * mu * T.Vectors().TwistFlavor(); - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, &cloverInv, kappa, a, QUDA_TWISTED_CLOVER_DIRAC, QUDA_MATPC_INVALID); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, a, QUDA_TWISTED_CLOVER_DIRAC, QUDA_MATPC_INVALID); } DiracTwistedCloverPC::DiracTwistedCloverPC(const DiracTwistedCloverPC &dirac) : DiracTwistedClover(dirac) { } @@ -213,12 +201,8 @@ namespace quda { twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); if (in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){ double a = -2.0 * kappa * in.TwistFlavor() * mu; //for invert twist (not daggered) @@ -239,9 +223,7 @@ namespace quda { errorQuda("Non-degenerate DiracTwistedCloverPC is not implemented \n"); } delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif } // xpay version of the above @@ -257,12 +239,8 @@ namespace quda { twistedclover::setFace(face1,face2); // FIXME: temporary hack maintain C linkage for dslashCuda - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){ double a = -2.0 * kappa * in.TwistFlavor() * mu; //for invert twist @@ -287,9 +265,7 @@ namespace quda { errorQuda("Non-degenerate DiracTwistedCloverPC is not implemented \n"); } delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif } void DiracTwistedCloverPC::M(ColorSpinorField &out, const ColorSpinorField &in) const @@ -298,12 +274,8 @@ namespace quda { bool reset = newTmp(&tmp1, in); - FullClover *cs = new FullClover(clover); -#ifndef DYNAMIC_CLOVER - FullClover *cI = new FullClover(cloverInv, false); -#else - FullClover *cI = NULL; -#endif + FullClover *cs = new FullClover(clover, false); + FullClover *cI = new FullClover(clover, true); if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS){ if (matpcType == QUDA_MATPC_EVEN_EVEN) { @@ -352,9 +324,7 @@ namespace quda { } delete cs; -#ifndef DYNAMIC_CLOVER delete cI; -#endif deleteTmp(&tmp1, reset); } @@ -452,6 +422,6 @@ namespace quda { void DiracTwistedCloverPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { double a = -2.0 * kappa * mu * T.Vectors().TwistFlavor(); - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, &cloverInv, kappa, a, QUDA_TWISTED_CLOVERPC_DIRAC, matpcType); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, &clover, kappa, a, QUDA_TWISTED_CLOVERPC_DIRAC, matpcType); } } // namespace quda diff --git a/lib/dirac_twisted_mass.cpp b/lib/dirac_twisted_mass.cpp index 25088fbeb8..36c404067b 100644 --- a/lib/dirac_twisted_mass.cpp +++ b/lib/dirac_twisted_mass.cpp @@ -189,7 +189,7 @@ namespace quda { void DiracTwistedMass::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { double a = 2.0 * kappa * mu * T.Vectors().TwistFlavor(); cudaCloverField *c = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, a, QUDA_TWISTED_MASS_DIRAC, QUDA_MATPC_INVALID); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, a, QUDA_TWISTED_MASS_DIRAC, QUDA_MATPC_INVALID); } DiracTwistedMassPC::DiracTwistedMassPC(const DiracTwistedMassPC &dirac) : DiracTwistedMass(dirac) { } @@ -548,6 +548,6 @@ namespace quda { void DiracTwistedMassPC::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { double a = -2.0 * kappa * mu * T.Vectors().TwistFlavor(); cudaCloverField *c = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, a, QUDA_TWISTED_MASSPC_DIRAC, matpcType); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, a, QUDA_TWISTED_MASSPC_DIRAC, matpcType); } } // namespace quda diff --git a/lib/dirac_wilson.cpp b/lib/dirac_wilson.cpp index 5b66da3077..cc27de9134 100644 --- a/lib/dirac_wilson.cpp +++ b/lib/dirac_wilson.cpp @@ -158,7 +158,7 @@ namespace quda { void DiracWilson::createCoarseOp(GaugeField &Y, GaugeField &X, GaugeField &Xinv, GaugeField &Yhat, const Transfer &T) const { cudaCloverField *c = NULL; - CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, c, kappa, 0.0, QUDA_WILSON_DIRAC, QUDA_MATPC_INVALID); + CoarseOp(Y, X, Xinv, Yhat, T, *gauge, c, kappa, 0.0, QUDA_WILSON_DIRAC, QUDA_MATPC_INVALID); } DiracWilsonPC::DiracWilsonPC(const DiracParam ¶m) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index cdebe9b9aa..4c7925561b 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -130,10 +130,6 @@ cudaCloverField *cloverPrecise = NULL; cudaCloverField *cloverSloppy = NULL; cudaCloverField *cloverPrecondition = NULL; -cudaCloverField *cloverInvPrecise = NULL; -cudaCloverField *cloverInvSloppy = NULL; -cudaCloverField *cloverInvPrecondition = NULL; - cudaGaugeField *momResident = NULL; cudaGaugeField *extendedGaugeResident = NULL; @@ -753,9 +749,8 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) CloverFieldParam clover_param; CloverField *in=NULL; - CloverField *inInv=NULL; - if(!device_calc){ + if (!device_calc) { // create a param for the cpu clover field profileClover.TPSTART(QUDA_PROFILE_INIT); CloverFieldParam cpuParam; @@ -776,30 +771,15 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { cpuParam.direct = true; - cpuParam.inverse = false; - cpuParam.cloverInv = NULL; + cpuParam.inverse = true; cpuParam.clover = h_clover; - cpuParam.twisted = true; - cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? - static_cast(new cpuCloverField(cpuParam)) : - static_cast(new cudaCloverField(cpuParam)); - cpuParam.cloverInv = h_clovinv; - cpuParam.clover = NULL; cpuParam.twisted = true; - cpuParam.direct = false; - cpuParam.inverse = true; cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - - inInv = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? - static_cast(new cpuCloverField(cpuParam)) : - static_cast(new cudaCloverField(cpuParam)); - } else { - in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? - static_cast(new cpuCloverField(cpuParam)) : - static_cast(new cudaCloverField(cpuParam)); } + in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? + static_cast(new cpuCloverField(cpuParam)) : + static_cast(new cudaCloverField(cpuParam)); clover_param.nDim = 4; for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i]; @@ -813,18 +793,14 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; clover_param.direct = true; - clover_param.inverse = false; clover_param.twisted = true; - cloverPrecise = new cudaCloverField(clover_param); #ifndef DYNAMIC_CLOVER - clover_param.direct = false; clover_param.inverse = true; - clover_param.twisted = true; - cloverInvPrecise = new cudaCloverField(clover_param); +#else + clover_param.inverse = false; #endif - } else { - cloverPrecise = new cudaCloverField(clover_param); } + cloverPrecise = new cudaCloverField(clover_param); profileClover.TPSTOP(QUDA_PROFILE_INIT); @@ -832,8 +808,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { cloverPrecise->copy(*in, false); #ifndef DYNAMIC_CLOVER - cloverInvPrecise->copy(*in, true); - cloverInvert(*cloverInvPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); + cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); #endif } else { cloverPrecise->copy(*in, h_clovinv ? true : false); @@ -847,10 +822,10 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) #ifndef DYNAMIC_CLOVER if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - cloverInvert(*cloverInvPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); + cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); if (inv_param->compute_clover_trlog) { - inv_param->trlogA[0] = cloverInvPrecise->TrLog()[0]; - inv_param->trlogA[1] = cloverInvPrecise->TrLog()[1]; + inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; + inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; } } #endif @@ -868,14 +843,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) } } -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) - inv_param->cloverGiB = cloverPrecise->GBytes(); - else - inv_param->cloverGiB = cloverPrecise->GBytes() + cloverInvPrecise->GBytes(); -#else inv_param->cloverGiB = cloverPrecise->GBytes(); -#endif clover_param.norm = 0; clover_param.invNorm = 0; @@ -896,30 +864,22 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; clover_param.twisted = true; + clover_param.direct = true; #ifndef DYNAMIC_CLOVER - clover_param.direct = false; clover_param.inverse = true; - cloverInvSloppy = new cudaCloverField(clover_param); - cloverInvSloppy->copy(*cloverInvPrecise, true); - clover_param.direct = true; +#else clover_param.inverse = false; - inv_param->cloverGiB += cloverInvSloppy->GBytes(); #endif cloverSloppy = new cudaCloverField(clover_param); - cloverSloppy->copy(*cloverPrecise); - inv_param->cloverGiB += cloverSloppy->GBytes(); + cloverSloppy->copy(*cloverPrecise, clover_param.inverse); } else { cloverSloppy = new cudaCloverField(clover_param); cloverSloppy->copy(*cloverPrecise); - inv_param->cloverGiB += cloverSloppy->GBytes(); } + inv_param->cloverGiB += cloverSloppy->GBytes(); profileClover.TPSTOP(QUDA_PROFILE_INIT); } else { cloverSloppy = cloverPrecise; -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - cloverInvSloppy = cloverInvPrecise; -#endif } // create the mirror preconditioner clover field @@ -929,37 +889,28 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) clover_param.setPrecision(inv_param->clover_cuda_prec_precondition); if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { clover_param.direct = true; - clover_param.inverse = false; - cloverPrecondition = new cudaCloverField(clover_param); - cloverPrecondition->copy(*cloverSloppy); - inv_param->cloverGiB += cloverPrecondition->GBytes(); #ifndef DYNAMIC_CLOVER - clover_param.direct = false; clover_param.inverse = true; - clover_param.twisted = true; - cloverInvPrecondition = new cudaCloverField(clover_param); - cloverInvPrecondition->copy(*cloverInvSloppy, true); - inv_param->cloverGiB += cloverInvPrecondition->GBytes(); +#else + clover_param.inverse = false; #endif + cloverPrecondition = new cudaCloverField(clover_param); + cloverPrecondition->copy(*cloverSloppy, clover_param.inverse); } else { cloverPrecondition = new cudaCloverField(clover_param); cloverPrecondition->copy(*cloverSloppy); - inv_param->cloverGiB += cloverPrecondition->GBytes(); } + inv_param->cloverGiB += cloverPrecondition->GBytes(); profileClover.TPSTOP(QUDA_PROFILE_INIT); } else { cloverPrecondition = cloverSloppy; -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - cloverInvPrecondition = cloverInvSloppy; -#endif } // need to copy back the inverse field for twisted-clover tests if (h_clovinv && !device_calc && (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)) { // copy the inverted clover term into host application order on the device clover_param.setPrecision(inv_param->clover_cpu_prec); - clover_param.direct = false; + clover_param.direct = true; clover_param.inverse = true; clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; clover_param.twisted = true; @@ -969,10 +920,10 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) #ifndef DYNAMIC_CLOVER clover_param.order = inv_param->clover_order; cudaCloverField hack(clover_param); - hack.copy(*cloverInvPrecise); + hack.copy(*cloverPrecise); #else cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack - hackOfTheHack->copy(*cloverPrecise); + hackOfTheHack->copy(*cloverPrecise, false); cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); clover_param.order = inv_param->clover_order; cudaCloverField hack(clover_param); @@ -983,8 +934,8 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) // copy the field into the host application's clover field profileClover.TPSTART(QUDA_PROFILE_D2H); - qudaMemcpy((char*)(inInv->V(true)), (char*)(hack.V(true)), - inInv->Bytes(), cudaMemcpyDeviceToHost); + qudaMemcpy((char*)(in->V(true)), (char*)(hack.V(true)), + in->Bytes(), cudaMemcpyDeviceToHost); profileClover.TPSTOP(QUDA_PROFILE_D2H); checkCudaError(); @@ -1013,10 +964,8 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) checkCudaError(); } - if(!device_calc) - { + if (!device_calc) { if (in) delete in; // delete object referencing input field - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inInv) delete inInv; } popVerbosity(); @@ -1204,16 +1153,6 @@ void freeCloverQuda(void) cloverPrecondition = NULL; cloverSloppy = NULL; cloverPrecise = NULL; - - if (cloverInvPrecise != NULL) { - if (cloverInvPrecondition != cloverInvSloppy && cloverInvPrecondition) delete cloverInvPrecondition; - if (cloverInvSloppy != cloverInvPrecise && cloverInvSloppy) delete cloverInvSloppy; - if (cloverInvPrecise) delete cloverInvPrecise; - - cloverInvPrecondition = NULL; - cloverInvSloppy = NULL; - cloverInvPrecise = NULL; - } } void endQuda(void) @@ -1388,7 +1327,6 @@ namespace quda { diracParam.fatGauge = gaugeFatPrecise; diracParam.longGauge = gaugeLongPrecise; diracParam.clover = cloverPrecise; - diracParam.cloverInv = cloverInvPrecise; diracParam.kappa = kappa; diracParam.mass = inv_param->mass; diracParam.m5 = inv_param->m5; @@ -1406,7 +1344,6 @@ namespace quda { diracParam.fatGauge = gaugeFatSloppy; diracParam.longGauge = gaugeLongSloppy; diracParam.clover = cloverSloppy; - diracParam.cloverInv = cloverInvSloppy; for (int i=0; i<4; i++) { diracParam.commDim[i] = 1; // comms are always on @@ -1429,7 +1366,6 @@ namespace quda { diracParam.longGauge = gaugeLongPrecondition; } diracParam.clover = cloverPrecondition; - diracParam.cloverInv = cloverInvPrecondition; for (int i=0; i<4; i++) { diracParam.commDim[i] = comms ? 1 : 0; @@ -1554,8 +1490,6 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity if (gaugePrecise == NULL) errorQuda("Gauge field not allocated"); if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); - if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - errorQuda("Clover field not allocated"); pushVerbosity(inv_param->verbosity); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); @@ -1778,8 +1712,6 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) if (gaugePrecise == NULL) errorQuda("Gauge field not allocated"); if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); - if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - errorQuda("Clover field not allocated"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || @@ -1851,8 +1783,6 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) if (gaugePrecise == NULL) errorQuda("Gauge field not allocated"); if (cloverPrecise == NULL && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); - if (cloverInvPrecise == NULL && inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - errorQuda("Clover field not allocated"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || @@ -2931,7 +2861,6 @@ void incrementalEigQuda(void *_h_x, void *_h_b, QudaInvertParam *param, void *_h diracHalfPrecParam.longGauge = gaugeLongPrecondition; diracHalfPrecParam.clover = cloverPrecondition; - diracHalfPrecParam.cloverInv = cloverInvPrecondition; for (int i=0; i<4; i++) { diracHalfPrecParam.commDim[i] = 1; // comms are on. @@ -3582,22 +3511,17 @@ void createCloverQuda(QudaInvertParam* invertParam) cloverParam.create = QUDA_NULL_FIELD_CREATE; cloverParam.siteSubset = QUDA_FULL_SITE_SUBSET; cloverParam.setPrecision(invertParam->cuda_prec); - if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) - { - + if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { cloverParam.twisted = true; cloverParam.mu2 = 4.*invertParam->kappa*invertParam->kappa*invertParam->mu*invertParam->mu; cloverParam.direct = true; - cloverParam.inverse = false; - cloverPrecise = new cudaCloverField(cloverParam); #ifndef DYNAMIC_CLOVER cloverParam.inverse = true; - cloverParam.direct = false; - cloverInvPrecise = new cudaCloverField(cloverParam); //FIXME Only with tmClover +#else + cloverParam.inverse = false; #endif - } else { - cloverPrecise = new cudaCloverField(cloverParam); } + cloverPrecise = new cudaCloverField(cloverParam); } int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction @@ -3625,36 +3549,11 @@ void createCloverQuda(QudaInvertParam* invertParam) // copy gaugePrecise into the extended device gauge field copyExtendedGauge(*cudaGaugeExtended, *gaugePrecise, QUDA_CUDA_FIELD_LOCATION); -#if 1 - profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT); - profileCloverCreate.TPSTART(QUDA_PROFILE_COMMS); - cudaGaugeExtended->exchangeExtendedGhost(R,true); - profileCloverCreate.TPSTOP(QUDA_PROFILE_COMMS); -#else - - GaugeFieldParam gParam(gaugePrecise->X(), gaugePrecise->Precision(), QUDA_RECONSTRUCT_NO, - pad, QUDA_VECTOR_GEOMETRY, QUDA_GHOST_EXCHANGE_NO); - gParam.create = QUDA_ZERO_FIELD_CREATE; - gParam.order = QUDA_MILC_GAUGE_ORDER; - gParam.siteSubset = QUDA_FULL_SITE_SUBSET; - gParam.t_boundary = gaugePrecise->TBoundary(); - gParam.nFace = 1; - - // create an extended gauge field on the host - for(int dir=0; dir<4; ++dir) gParam.x[dir] += 4; - cpuGaugeField cpuGaugeExtended(gParam); - cudaGaugeExtended->saveCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION); profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT); - // communicate data profileCloverCreate.TPSTART(QUDA_PROFILE_COMMS); - //exchange_cpu_sitelink_ex(const_cast(gaugePrecise->X()), R, (void**)cpuGaugeExtended.Gauge_p(), - // cpuGaugeExtended.Order(),cpuGaugeExtended.Precision(), 0, 4); - cpuGaugeExtended.exchangeExtendedGhost(R,true); - - cudaGaugeExtended->loadCPUField(cpuGaugeExtended, QUDA_CPU_FIELD_LOCATION); + cudaGaugeExtended->exchangeExtendedGhost(R,true); profileCloverCreate.TPSTOP(QUDA_PROFILE_COMMS); -#endif } #ifdef MULTI_GPU @@ -3663,7 +3562,6 @@ void createCloverQuda(QudaInvertParam* invertParam) GaugeField *gauge = gaugePrecise; #endif - profileCloverCreate.TPSTART(QUDA_PROFILE_INIT); // create the Fmunu field GaugeFieldParam tensorParam(gaugePrecise->X(), gauge->Precision(), QUDA_RECONSTRUCT_NO, pad, QUDA_TENSOR_GEOMETRY); @@ -3674,16 +3572,8 @@ void createCloverQuda(QudaInvertParam* invertParam) profileCloverCreate.TPSTOP(QUDA_PROFILE_INIT); profileCloverCreate.TPSTART(QUDA_PROFILE_COMPUTE); - computeFmunu(Fmunu, *gauge, QUDA_CUDA_FIELD_LOCATION); computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION); - -#ifndef DYNAMIC_CLOVER - if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - computeClover(*cloverInvPrecise, Fmunu, invertParam->clover_coeff, QUDA_CUDA_FIELD_LOCATION); // FIXME only with tmClover - } -#endif - profileCloverCreate.TPSTOP(QUDA_PROFILE_COMPUTE); profileCloverCreate.TPSTOP(QUDA_PROFILE_TOTAL); From 3a8139cdac58ebf269b8864aac59e25805a71daf Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 17 Jun 2016 23:31:10 -0700 Subject: [PATCH 15/24] Added default and copy constructors for CloverFieldParam --- include/clover_field.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/clover_field.h b/include/clover_field.h index 6f16c99154..1c1f2067f4 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -25,6 +25,16 @@ namespace quda { order = (precision == QUDA_DOUBLE_PRECISION) ? QUDA_FLOAT2_CLOVER_ORDER : QUDA_FLOAT4_CLOVER_ORDER; } + + CloverFieldParam() : LatticeFieldParam(), + direct(true), inverse(true), clover(nullptr), norm(nullptr), + cloverInv(nullptr), invNorm(nullptr), twisted(false), mu2(0.0) { } + + CloverFieldParam(const CloverFieldParam ¶m) : LatticeFieldParam(param), + direct(param.direct), inverse(param.inverse), + clover(param.clover), norm(param.norm), + cloverInv(param.cloverInv), invNorm(param.invNorm), + twisted(param.twisted), mu2(param.mu2) { } }; std::ostream& operator<<(std::ostream& output, const CloverFieldParam& param); From b1d088a530738bcdb2bec39cec82fc838c1dd722 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 17 Jun 2016 23:37:53 -0700 Subject: [PATCH 16/24] Significant cleanup of loadCloverQuda to minimize redundant code. Added new parameters QudaInvertParam::compute_clover_invert and QudaInvertParam::return_clover_invert to explicitly request clover inversion computation and copy back to the host. --- include/quda.h | 3 + lib/check_params.h | 9 +- lib/interface_quda.cpp | 275 ++++++++++++----------------------------- 3 files changed, 92 insertions(+), 195 deletions(-) diff --git a/include/quda.h b/include/quda.h index 7b22576d35..4b9d0efda3 100644 --- a/include/quda.h +++ b/include/quda.h @@ -183,6 +183,9 @@ extern "C" { int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */ double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */ + int compute_clover_inverse; /**< Whether to compute the clover inverse field */ + int return_clover_inverse; /**< Whether to copy back the inverted clover matrix field */ + QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */ int sp_pad; /**< The padding to use for the fermion fields */ diff --git a/lib/check_params.h b/lib/check_params.h index a10b3d63cb..ef8d0d47f3 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -355,9 +355,14 @@ void printQudaInvertParam(QudaInvertParam *param) { #if defined INIT_PARAM P(clover_cuda_prec_precondition, QUDA_INVALID_PRECISION); P(compute_clover_trlog, 0); + P(compute_clover_inverse, 0); + P(return_clover_inverse, 0); #else - if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION) - param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy; + if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION) + param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy; + P(compute_clover_trlog, QUDA_INVALID_PRECISION); + P(compute_clover_inverse, QUDA_INVALID_PRECISION); + P(return_clover_inverse, QUDA_INVALID_PRECISION); #endif P(clover_order, QUDA_INVALID_CLOVER_ORDER); P(cl_pad, INVALID_INT); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 4c7925561b..f19b91be4a 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -704,20 +704,15 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (!initialized) errorQuda("QUDA not initialized"); if (!h_clover && !h_clovinv) { - if(inv_param->clover_coeff != 0){ + if (inv_param->clover_coeff != 0) { device_calc = true; - }else{ - errorQuda("loadCloverQuda() called with neither clover term nor inverse"); + } else { + errorQuda("loadCloverQuda() called with neither clover term nor inverse and clover coefficient not set"); } } - - if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) { - errorQuda("Half precision not supported on CPU"); - } - if (gaugePrecise == NULL) { - errorQuda("Gauge field must be loaded before clover"); - } + if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) errorQuda("Half precision not supported on CPU"); + if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded before clover"); if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)) { errorQuda("Wrong dslash_type in loadCloverQuda()"); } @@ -747,135 +742,84 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) warningQuda("Uninverted clover term not loaded"); } + bool twisted = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? true : false; +#ifdef DYNAMIC_CLOVER + bool dynamic_clover = twisted ? true : false; // dynamic clover only supported on twisted clover currently +#else + bool dynamic_clover = false; +#endif + CloverFieldParam clover_param; - CloverField *in=NULL; + clover_param.nDim = 4; + clover_param.twisted = twisted; + clover_param.mu2 = twisted ? 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu : 0.0; + clover_param.siteSubset = QUDA_FULL_SITE_SUBSET; + for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i]; + clover_param.pad = inv_param->cl_pad; + clover_param.create = QUDA_NULL_FIELD_CREATE; + clover_param.norm = nullptr; + clover_param.invNorm = nullptr; + clover_param.setPrecision(inv_param->clover_cuda_prec); + clover_param.direct = h_clover || device_calc ? true : false; + clover_param.inverse = (h_clovinv || pc_solve) && !dynamic_clover ? true : false; + + cloverPrecise = new cudaCloverField(clover_param); + + CloverField *in = nullptr; if (!device_calc) { // create a param for the cpu clover field profileClover.TPSTART(QUDA_PROFILE_INIT); - CloverFieldParam cpuParam; - cpuParam.nDim = 4; - for (int i=0; i<4; i++) cpuParam.x[i] = gaugePrecise->X()[i]; - cpuParam.precision = inv_param->clover_cpu_prec; - cpuParam.order = inv_param->clover_order; - cpuParam.direct = h_clover ? true : false; - cpuParam.inverse = h_clovinv ? true : false; - cpuParam.clover = h_clover; - cpuParam.norm = 0; - cpuParam.cloverInv = h_clovinv; - cpuParam.invNorm = 0; - cpuParam.create = QUDA_REFERENCE_FIELD_CREATE; - cpuParam.siteSubset = QUDA_FULL_SITE_SUBSET; - cpuParam.twisted = false; - cpuParam.mu2 = 0.; - - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - cpuParam.direct = true; - cpuParam.inverse = true; - cpuParam.clover = h_clover; - cpuParam.cloverInv = h_clovinv; - cpuParam.twisted = true; - cpuParam.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - } + CloverFieldParam inParam(clover_param); + inParam.precision = inv_param->clover_cpu_prec; + inParam.order = inv_param->clover_order; + inParam.direct = h_clover ? true : false; + inParam.inverse = h_clovinv ? true : false; + inParam.clover = h_clover; + inParam.cloverInv = h_clovinv; + inParam.create = QUDA_REFERENCE_FIELD_CREATE; + in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? - static_cast(new cpuCloverField(cpuParam)) : - static_cast(new cudaCloverField(cpuParam)); - - clover_param.nDim = 4; - for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i]; - clover_param.setPrecision(inv_param->clover_cuda_prec); - clover_param.pad = inv_param->cl_pad; - clover_param.direct = h_clover ? true : false; - clover_param.inverse = (h_clovinv || pc_solve) ? true : false; - clover_param.create = QUDA_NULL_FIELD_CREATE; - clover_param.siteSubset = QUDA_FULL_SITE_SUBSET; - - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - clover_param.direct = true; - clover_param.twisted = true; -#ifndef DYNAMIC_CLOVER - clover_param.inverse = true; -#else - clover_param.inverse = false; -#endif - } - cloverPrecise = new cudaCloverField(clover_param); + static_cast(new cpuCloverField(inParam)) : + static_cast(new cudaCloverField(inParam)); profileClover.TPSTOP(QUDA_PROFILE_INIT); profileClover.TPSTART(QUDA_PROFILE_H2D); - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - cloverPrecise->copy(*in, false); -#ifndef DYNAMIC_CLOVER - cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); -#endif - } else { - cloverPrecise->copy(*in, h_clovinv ? true : false); - } - + cloverPrecise->copy(*in, h_clovinv && !inv_param->compute_clover_inverse ? true : false); profileClover.TPSTOP(QUDA_PROFILE_H2D); } else { profileClover.TPSTART(QUDA_PROFILE_COMPUTE); - createCloverQuda(inv_param); - -#ifndef DYNAMIC_CLOVER - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); - if (inv_param->compute_clover_trlog) { - inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; - inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; - } - } -#endif profileClover.TPSTOP(QUDA_PROFILE_COMPUTE); } // inverted clover term is required when applying preconditioned operator - if ((!h_clovinv && pc_solve) && inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) { + if ((!h_clovinv || inv_param->compute_clover_inverse) && pc_solve) { profileClover.TPSTART(QUDA_PROFILE_COMPUTE); - cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); - profileClover.TPSTOP(QUDA_PROFILE_COMPUTE); - if (inv_param->compute_clover_trlog) { - inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; - inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; + if (!dynamic_clover) { + cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); + if (inv_param->compute_clover_trlog) { + inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; + inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; + } } + profileClover.TPSTOP(QUDA_PROFILE_COMPUTE); } inv_param->cloverGiB = cloverPrecise->GBytes(); - clover_param.norm = 0; - clover_param.invNorm = 0; - clover_param.mu2 = 0.; - clover_param.nDim = 4; - for(int dir=0; dir<4; ++dir) clover_param.x[dir] = gaugePrecise->X()[dir]; - clover_param.pad = inv_param->cl_pad; - clover_param.siteSubset = QUDA_FULL_SITE_SUBSET; - clover_param.create = QUDA_NULL_FIELD_CREATE; clover_param.direct = true; - clover_param.inverse = true; + clover_param.inverse = dynamic_clover ? false : true; // create the mirror sloppy clover field if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) { profileClover.TPSTART(QUDA_PROFILE_INIT); + clover_param.setPrecision(inv_param->clover_cuda_prec_sloppy); + cloverSloppy = new cudaCloverField(clover_param); + cloverSloppy->copy(*cloverPrecise, clover_param.inverse); - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - clover_param.twisted = true; - clover_param.direct = true; -#ifndef DYNAMIC_CLOVER - clover_param.inverse = true; -#else - clover_param.inverse = false; -#endif - cloverSloppy = new cudaCloverField(clover_param); - cloverSloppy->copy(*cloverPrecise, clover_param.inverse); - } else { - cloverSloppy = new cudaCloverField(clover_param); - cloverSloppy->copy(*cloverPrecise); - } inv_param->cloverGiB += cloverSloppy->GBytes(); profileClover.TPSTOP(QUDA_PROFILE_INIT); } else { @@ -886,87 +830,57 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (inv_param->clover_cuda_prec_sloppy != inv_param->clover_cuda_prec_precondition && inv_param->clover_cuda_prec_precondition != QUDA_INVALID_PRECISION) { profileClover.TPSTART(QUDA_PROFILE_INIT); + clover_param.setPrecision(inv_param->clover_cuda_prec_precondition); - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - clover_param.direct = true; -#ifndef DYNAMIC_CLOVER - clover_param.inverse = true; -#else - clover_param.inverse = false; -#endif - cloverPrecondition = new cudaCloverField(clover_param); - cloverPrecondition->copy(*cloverSloppy, clover_param.inverse); - } else { - cloverPrecondition = new cudaCloverField(clover_param); - cloverPrecondition->copy(*cloverSloppy); - } + cloverPrecondition = new cudaCloverField(clover_param); + cloverPrecondition->copy(*cloverSloppy, clover_param.inverse); + inv_param->cloverGiB += cloverPrecondition->GBytes(); profileClover.TPSTOP(QUDA_PROFILE_INIT); } else { cloverPrecondition = cloverSloppy; } - // need to copy back the inverse field for twisted-clover tests - if (h_clovinv && !device_calc && (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)) { - // copy the inverted clover term into host application order on the device - clover_param.setPrecision(inv_param->clover_cpu_prec); - clover_param.direct = true; - clover_param.inverse = true; - clover_param.mu2 = 4.*inv_param->kappa*inv_param->kappa*inv_param->mu*inv_param->mu; - clover_param.twisted = true; - - // this isn't really "epilogue" but this label suffices - profileClover.TPSTART(QUDA_PROFILE_EPILOGUE); -#ifndef DYNAMIC_CLOVER - clover_param.order = inv_param->clover_order; - cudaCloverField hack(clover_param); - hack.copy(*cloverPrecise); -#else - cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack - hackOfTheHack->copy(*cloverPrecise, false); - cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); - clover_param.order = inv_param->clover_order; - cudaCloverField hack(clover_param); - hack.copy(*hackOfTheHack); - delete hackOfTheHack; -#endif - profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE); - - // copy the field into the host application's clover field - profileClover.TPSTART(QUDA_PROFILE_D2H); - qudaMemcpy((char*)(in->V(true)), (char*)(hack.V(true)), - in->Bytes(), cudaMemcpyDeviceToHost); - profileClover.TPSTOP(QUDA_PROFILE_D2H); - - checkCudaError(); - } - - // need to copy back the odd inverse field into the application clover field - if (!h_clovinv && pc_solve && !device_calc) { + // if requested, copy back the inverse field + if (h_clovinv && inv_param->compute_clover_inverse && inv_param->return_clover_inverse) { // copy the inverted clover term into host application order on the device clover_param.setPrecision(inv_param->clover_cpu_prec); clover_param.direct = false; clover_param.inverse = true; - clover_param.order = inv_param->clover_order; // this isn't really "epilogue" but this label suffices profileClover.TPSTART(QUDA_PROFILE_EPILOGUE); - cudaCloverField hack(clover_param); - hack.copy(*cloverPrecise); + cudaCloverField *hack = nullptr; + if (!dynamic_clover) { + clover_param.order = inv_param->clover_order; + hack = new cudaCloverField(clover_param); + hack->copy(*cloverPrecise); // FIXME this will lead to an initial redundant copy that is overwritten + } else { + cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack + hackOfTheHack->copy(*cloverPrecise, false); + cloverInvert(*hackOfTheHack, inv_param->compute_clover_trlog, QUDA_CUDA_FIELD_LOCATION); + if (inv_param->compute_clover_trlog) { + inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; + inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; + } + clover_param.order = inv_param->clover_order; + hack = new cudaCloverField(clover_param); + hack->copy(*hackOfTheHack); // FIXME this will lead to an initial redundant copy that is overwritten + delete hackOfTheHack; + } profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE); - // copy the odd components into the host application's clover field + // copy the field into the host application's clover field profileClover.TPSTART(QUDA_PROFILE_D2H); - qudaMemcpy((char*)(in->V(false))+in->Bytes()/2, (char*)(hack.V(true))+hack.Bytes()/2, - in->Bytes()/2, cudaMemcpyDeviceToHost); + qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), + in->Bytes(), cudaMemcpyDeviceToHost); profileClover.TPSTOP(QUDA_PROFILE_D2H); + delete hack; checkCudaError(); } - if (!device_calc) { - if (in) delete in; // delete object referencing input field - } + if (in) delete in; // delete object referencing input field popVerbosity(); @@ -3497,32 +3411,7 @@ void createCloverQuda(QudaInvertParam* invertParam) { profileCloverCreate.TPSTART(QUDA_PROFILE_TOTAL); profileCloverCreate.TPSTART(QUDA_PROFILE_INIT); - if(!cloverPrecise){ - CloverFieldParam cloverParam; - cloverParam.nDim = 4; - for(int dir=0; dir<4; ++dir) cloverParam.x[dir] = gaugePrecise->X()[dir]; - cloverParam.setPrecision(invertParam->clover_cuda_prec); - cloverParam.pad = invertParam->cl_pad; - cloverParam.direct = true; - cloverParam.inverse = true; - cloverParam.norm = 0; - cloverParam.invNorm = 0; - cloverParam.twisted = false; - cloverParam.create = QUDA_NULL_FIELD_CREATE; - cloverParam.siteSubset = QUDA_FULL_SITE_SUBSET; - cloverParam.setPrecision(invertParam->cuda_prec); - if (invertParam->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - cloverParam.twisted = true; - cloverParam.mu2 = 4.*invertParam->kappa*invertParam->kappa*invertParam->mu*invertParam->mu; - cloverParam.direct = true; -#ifndef DYNAMIC_CLOVER - cloverParam.inverse = true; -#else - cloverParam.inverse = false; -#endif - } - cloverPrecise = new cudaCloverField(cloverParam); - } + if (!cloverPrecise) errorQuda("Clover field not allocated"); int R[4] = {2,2,2,2}; // radius of the extended region in each dimension / direction int y[4]; From 0a64e6dffedc29c2abbdd1ae5e827d222a452ce0 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 17 Jun 2016 23:43:41 -0700 Subject: [PATCH 17/24] In dslash_test, clover now matches twisted-clover in that the clover field is inverted and returned when checking for correctness --- tests/dslash_test.cpp | 41 ++++++++++------------------------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index eacefcf315..691f1794b4 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -144,6 +144,7 @@ void init(int argc, char **argv) { inv_param.Ls = (inv_param.twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) ? Ls : 2; + inv_param.solve_type = (test_type == 2 || test_type == 4) ? QUDA_DIRECT_SOLVE : QUDA_DIRECT_PC_SOLVE; inv_param.matpc_type = matpc_type; inv_param.dagger = dagger; not_dagger = (QudaDagType)((dagger + 1)%2); @@ -231,20 +232,7 @@ void init(int argc, char **argv) { inv_param.dslash_type = dslash_type; - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { - inv_param.clover_cpu_prec = cpu_prec; - inv_param.clover_cuda_prec = cuda_prec; - inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; - inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; - inv_param.clover_coeff = 1.5*inv_param.kappa; - //if (test_type > 0) { - hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); - hostCloverInv = hostClover; // fake it - /*} else { - hostClover = NULL; - hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); - }*/ - } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { inv_param.clover_cpu_prec = cpu_prec; inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; @@ -326,22 +314,10 @@ void init(int argc, char **argv) { spinor->Source(QUDA_RANDOM_SOURCE, 0); - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { - double norm = 1.0; // clover components are random numbers in the range (-norm, norm) - double diag = 1.0; // constant added to the diagonal + double norm = 0.1; // clover components are random numbers in the range (-norm, norm) + double diag = 1.0; // constant added to the diagonal + construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); - if (test_type == 2 || test_type == 4) { - construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); - } else { - construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec); - } - } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - // We are not so aggressive with the random clover term here because we need to invert it, so it must make some sense - double norm = 0.1; // clover components are random numbers in the range (-norm, norm) - double diag = 1.0; // constant added to the diagonal - - construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); - } printfQuda("done.\n"); fflush(stdout); initQuda(device); @@ -360,6 +336,9 @@ void init(int argc, char **argv) { if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { printfQuda("Sending clover field to GPU\n"); + inv_param.compute_clover_inverse = 1; + inv_param.return_clover_inverse = 1; + loadCloverQuda(hostClover, hostCloverInv, &inv_param); } @@ -457,8 +436,8 @@ void end() { delete spinorTmp; for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); - if((dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (dslash_type == QUDA_TWISTED_CLOVER_DSLASH)) { - if (hostClover != hostCloverInv && hostClover) free(hostClover); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + free(hostClover); free(hostCloverInv); } endQuda(); From 8ae13732ff2d187619cad2b8c3a67c8840f02b08 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 20 Jun 2016 20:01:05 -0700 Subject: [PATCH 18/24] Added new parameters QudaInvertParam::compute_clover and QudaInvertParam::return_clover which are used to instruct loadcloverQuda whether to compute the clover field or to download it, and also once computed, whether to copy it back to the host application. Default is to no to both to repsect previous interface behaviour. --- include/quda.h | 2 ++ lib/check_params.h | 4 ++++ lib/interface_quda.cpp | 42 ++++++++++++++++++++++++------------------ 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/include/quda.h b/include/quda.h index 4b9d0efda3..1b38429d14 100644 --- a/include/quda.h +++ b/include/quda.h @@ -183,7 +183,9 @@ extern "C" { int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */ double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */ + int compute_clover; /**< Whether to compute the clover field */ int compute_clover_inverse; /**< Whether to compute the clover inverse field */ + int return_clover; /**< Whether to copy back the clover matrix field */ int return_clover_inverse; /**< Whether to copy back the inverted clover matrix field */ QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */ diff --git a/lib/check_params.h b/lib/check_params.h index ef8d0d47f3..d04220a5a2 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -355,13 +355,17 @@ void printQudaInvertParam(QudaInvertParam *param) { #if defined INIT_PARAM P(clover_cuda_prec_precondition, QUDA_INVALID_PRECISION); P(compute_clover_trlog, 0); + P(compute_clover, 0); P(compute_clover_inverse, 0); + P(return_clover, 0); P(return_clover_inverse, 0); #else if (param->clover_cuda_prec_precondition == QUDA_INVALID_PRECISION) param->clover_cuda_prec_precondition = param->clover_cuda_prec_sloppy; P(compute_clover_trlog, QUDA_INVALID_PRECISION); + P(compute_clover, QUDA_INVALID_PRECISION); P(compute_clover_inverse, QUDA_INVALID_PRECISION); + P(return_clover, QUDA_INVALID_PRECISION); P(return_clover_inverse, QUDA_INVALID_PRECISION); #endif P(clover_order, QUDA_INVALID_CLOVER_ORDER); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index f19b91be4a..64dab6ebd6 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -695,6 +695,8 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) { + if (!gaugePrecise) errorQuda("Cannot call loadCloverQuda with no resident gauge field"); + profileClover.TPSTART(QUDA_PROFILE_TOTAL); bool device_calc = false; // calculate clover and inverse on the device? @@ -703,18 +705,16 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (!initialized) errorQuda("QUDA not initialized"); - if (!h_clover && !h_clovinv) { - if (inv_param->clover_coeff != 0) { - device_calc = true; - } else { - errorQuda("loadCloverQuda() called with neither clover term nor inverse and clover coefficient not set"); - } + if ( (!h_clover && !h_clovinv) || inv_param->compute_clover ) { + device_calc = true; + if (inv_param->clover_coeff == 0.0) errorQuda("called with neither clover term nor inverse and clover coefficient not set"); + if (gaugePrecise->Anisotropy() != 1.0) errorQuda("cannot compute anisotropic clover field"); } if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) errorQuda("Half precision not supported on CPU"); if (gaugePrecise == NULL) errorQuda("Gauge field must be loaded before clover"); if ((inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) && (inv_param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH)) { - errorQuda("Wrong dslash_type in loadCloverQuda()"); + errorQuda("Wrong dslash_type %d in loadCloverQuda()", inv_param->dslash_type); } // determines whether operator is preconditioned when calling invertQuda() @@ -767,7 +767,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) CloverField *in = nullptr; - if (!device_calc) { + if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) { // create a param for the cpu clover field profileClover.TPSTART(QUDA_PROFILE_INIT); CloverFieldParam inParam(clover_param); @@ -778,13 +778,13 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) inParam.clover = h_clover; inParam.cloverInv = h_clovinv; inParam.create = QUDA_REFERENCE_FIELD_CREATE; - in = (inv_param->clover_location == QUDA_CPU_FIELD_LOCATION) ? static_cast(new cpuCloverField(inParam)) : static_cast(new cudaCloverField(inParam)); - profileClover.TPSTOP(QUDA_PROFILE_INIT); + } + if (!device_calc) { profileClover.TPSTART(QUDA_PROFILE_H2D); cloverPrecise->copy(*in, h_clovinv && !inv_param->compute_clover_inverse ? true : false); profileClover.TPSTOP(QUDA_PROFILE_H2D); @@ -841,12 +841,14 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) cloverPrecondition = cloverSloppy; } - // if requested, copy back the inverse field - if (h_clovinv && inv_param->compute_clover_inverse && inv_param->return_clover_inverse) { + // if requested, copy back the clover / inverse field + if ( inv_param->return_clover || inv_param->return_clover_inverse ) { + if (!h_clover && !h_clovinv) errorQuda("Requested clover field return but no clover host pointers set"); + // copy the inverted clover term into host application order on the device clover_param.setPrecision(inv_param->clover_cpu_prec); - clover_param.direct = false; - clover_param.inverse = true; + clover_param.direct = (h_clover && inv_param->return_clover); + clover_param.inverse = (h_clovinv && inv_param->return_clover_inverse); // this isn't really "epilogue" but this label suffices profileClover.TPSTART(QUDA_PROFILE_EPILOGUE); @@ -854,7 +856,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (!dynamic_clover) { clover_param.order = inv_param->clover_order; hack = new cudaCloverField(clover_param); - hack->copy(*cloverPrecise); // FIXME this will lead to an initial redundant copy that is overwritten + hack->copy(*cloverPrecise); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse } else { cudaCloverField *hackOfTheHack = new cudaCloverField(clover_param); // Hack of the hack hackOfTheHack->copy(*cloverPrecise, false); @@ -865,15 +867,19 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) } clover_param.order = inv_param->clover_order; hack = new cudaCloverField(clover_param); - hack->copy(*hackOfTheHack); // FIXME this will lead to an initial redundant copy that is overwritten + hack->copy(*hackOfTheHack); // FIXME this can lead to an redundant copies if we're not copying back direct + inverse delete hackOfTheHack; } profileClover.TPSTOP(QUDA_PROFILE_EPILOGUE); // copy the field into the host application's clover field profileClover.TPSTART(QUDA_PROFILE_D2H); - qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), - in->Bytes(), cudaMemcpyDeviceToHost); + if (inv_param->return_clover) { + qudaMemcpy((char*)(in->V(false)), (char*)(hack->V(false)), in->Bytes(), cudaMemcpyDeviceToHost); + } + if (inv_param->return_clover_inverse) { + qudaMemcpy((char*)(in->V(true)), (char*)(hack->V(true)), in->Bytes(), cudaMemcpyDeviceToHost); + } profileClover.TPSTOP(QUDA_PROFILE_D2H); delete hack; From d2207e2b7a1a285cde3b6f9c5cdf71eb26740920 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 20 Jun 2016 20:03:39 -0700 Subject: [PATCH 19/24] Added new command-line parameters to unit tests for enabling QUDA-side computation of the clover field (and inverse) and for setting the clover coefficient. Updated dslash_test for this: if --compute-clover true is given then the clover field and its inverse is computed in QUDA and is copied back to the host for verfication; else a random field (which is also used for the inverse)is constructed on the host and downloaded to the device. Default is the latter as per previous behaviour. --- tests/dslash_test.cpp | 22 ++++++++++++---------- tests/test_util.cpp | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index 691f1794b4..60f805850d 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -69,6 +69,8 @@ extern QudaPrecision prec; extern QudaDagType dagger; QudaDagType not_dagger; +extern bool compute_clover; + extern bool verify_results; extern int niter; extern char latfile[]; @@ -275,15 +277,11 @@ void init(int argc, char **argv) { csParam.precision = inv_param.cpu_prec; csParam.pad = 0; - if(dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || - dslash_type == QUDA_MOBIUS_DWF_DSLASH) - { + if(dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || dslash_type == QUDA_MOBIUS_DWF_DSLASH) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; - - } else - { - if (test_type < 2 || test_type ==3) { + } else { + if (test_type < 2 || test_type == 3) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } else { @@ -317,6 +315,7 @@ void init(int argc, char **argv) { double norm = 0.1; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); + memcpy(hostCloverInv, hostClover, V*cloverSiteSize*inv_param.clover_cpu_prec); printfQuda("done.\n"); fflush(stdout); @@ -335,9 +334,12 @@ void init(int argc, char **argv) { loadGaugeQuda(hostGauge, &gauge_param); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - printfQuda("Sending clover field to GPU\n"); - inv_param.compute_clover_inverse = 1; - inv_param.return_clover_inverse = 1; + if (compute_clover) printfQuda("Computing clover field on GPU\n"); + else printfQuda("Sending clover field to GPU\n"); + inv_param.compute_clover = compute_clover; + inv_param.return_clover = compute_clover; + inv_param.compute_clover_inverse = compute_clover; + inv_param.return_clover_inverse = compute_clover; loadCloverQuda(hostClover, hostCloverInv, &inv_param); } diff --git a/tests/test_util.cpp b/tests/test_util.cpp index 38c5e3cda5..c484e7ec64 100644 --- a/tests/test_util.cpp +++ b/tests/test_util.cpp @@ -1590,6 +1590,8 @@ bool verify_results = true; double mass = 0.1; double mu = 0.1; double anisotropy = 1.0; +double clover_coeff = 1.0; +bool compute_clover = false; double tol = 1e-7; double tol_hq = 0.; QudaTwistFlavorType twist_flavor = QUDA_TWIST_MINUS; @@ -1656,6 +1658,8 @@ void usage(char** argv ) printf(" --multishift # Whether to do a multi-shift solver test or not (default false)\n"); printf(" --mass # Mass of Dirac operator (default 0.1)\n"); printf(" --mu # Twisted-Mass of Dirac operator (default 0.1)\n"); + printf(" --compute-clover # Compute the clover field or use random numbers (default false)\n"); + printf(" --clover-coeff # Clover coefficient (default 1.0)\n"); printf(" --anisotropy # Temporal anisotropy factor (default 1.0)\n"); printf(" --mass-normalization # Mass normalization (kappa (default) / mass / asym-mass)\n"); printf(" --matpc # Matrix preconditioning type (even-even, odd-odd, even-even-asym, odd-odd-asym) \n"); @@ -2150,6 +2154,34 @@ int process_command_line_option(int argc, char** argv, int* idx) goto out; } + if( strcmp(argv[i], "--compute-clover") == 0){ + if (i+1 >= argc){ + usage(argv); + } + if (strcmp(argv[i+1], "true") == 0){ + compute_clover = true; + }else if (strcmp(argv[i+1], "false") == 0){ + compute_clover = false; + }else{ + fprintf(stderr, "ERROR: invalid compute_clover type\n"); + exit(1); + } + + i++; + ret = 0; + goto out; + } + + if( strcmp(argv[i], "--clover-coeff") == 0){ + if (i+1 >= argc){ + usage(argv); + } + clover_coeff = atof(argv[i+1]); + i++; + ret = 0; + goto out; + } + if( strcmp(argv[i], "--mu") == 0){ if (i+1 >= argc){ usage(argv); From 807a2c046c9db65bc539fadab77ab5b200cfd5bb Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 20 Jun 2016 20:06:18 -0700 Subject: [PATCH 20/24] invert_test now supports --clover-coeff and --compute-clover command line parameters: if the former is set to true then the true clover field is computed using the set clover coefficient. Enabled multi-shift solver testing for twisted-clover and fixed memory leaks in invert_test. --- tests/invert_test.cpp | 100 ++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 38 deletions(-) diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index 962af569cf..2eabfe7587 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -57,6 +57,9 @@ extern double tol_hq; // heavy-quark tolerance for inverter extern QudaMassNormalization normalization; // mass normalization of Dirac operators extern QudaMatPCType matpc_type; // preconditioning type +extern double clover_coeff; +extern bool compute_clover; + extern int niter; // max solver iterations extern char latfile[]; @@ -184,12 +187,14 @@ int main(int argc, char **argv) for (int i=0; iVolume(); void *evenOut = spinorCheck; void *oddOut = cpu_prec == sizeof(double) ? (void*)((double*)evenOut + tm_offset): (void*)((float*)evenOut + tm_offset); - + void *evenIn = spinorOut; void *oddIn = cpu_prec == sizeof(double) ? (void*)((double*)evenIn + tm_offset): (void*)((float*)evenIn + tm_offset); - + tm_ndeg_mat(evenOut, oddOut, gauge, evenIn, oddIn, inv_param.kappa, inv_param.mu, inv_param.epsilon, 0, inv_param.cpu_prec, gauge_param); } + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + tmc_mat(spinorCheck, gauge, clover, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 0, + inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_WILSON_DSLASH) { wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { @@ -506,11 +511,16 @@ int main(int argc, char **argv) } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) { - if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) errorQuda("Twisted mass solution type not supported"); tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) + errorQuda("Twisted mass solution type not supported"); + tmc_matpc(spinorCheck, gauge, spinorOut, clover, clover_inv, inv_param.kappa, inv_param.mu, + inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_WILSON_DSLASH) { wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); @@ -554,13 +564,20 @@ int main(int argc, char **argv) ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); - if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) errorQuda("Twisted mass solution type not supported"); tm_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); + } else if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) + errorQuda("Twisted mass solution type not supported"); + tmc_matpc(spinorTmp, gauge, spinorOut, clover, clover_inv, inv_param.kappa, inv_param.mu, + inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); + tmc_matpc(spinorCheck, gauge, spinorTmp, clover, clover_inv, inv_param.kappa, inv_param.mu, + inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_WILSON_DSLASH) { wil_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); @@ -602,5 +619,12 @@ int main(int argc, char **argv) finalizeComms(); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (clover) free(clover); + if (clover_inv) free(clover_inv); + } + + for (int dir = 0; dir<4; dir++) free(gauge[dir]); + return 0; } From 86034f9edb02991715729cdf177bccc5409a90b4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 20 Jun 2016 20:09:00 -0700 Subject: [PATCH 21/24] Added missing --clover-coeff support to dslash_test --- tests/dslash_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index 60f805850d..14ac029b8f 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -70,6 +70,7 @@ extern QudaDagType dagger; QudaDagType not_dagger; extern bool compute_clover; +extern double clover_coeff; extern bool verify_results; extern int niter; @@ -239,7 +240,7 @@ void init(int argc, char **argv) { inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; - inv_param.clover_coeff = 1.5*inv_param.kappa; + inv_param.clover_coeff = clover_coeff; hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); } From d011bf2dbd9242a27722a04f678e8b5f94fad575 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 20 Jun 2016 22:28:57 -0700 Subject: [PATCH 22/24] Added support for new clover testing in multigrid_invert_test --- tests/multigrid_invert_test.cpp | 53 ++++++++++++++------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/tests/multigrid_invert_test.cpp b/tests/multigrid_invert_test.cpp index a3b88d4c11..8b842ec10f 100644 --- a/tests/multigrid_invert_test.cpp +++ b/tests/multigrid_invert_test.cpp @@ -72,7 +72,8 @@ extern QudaTwistFlavorType twist_flavor; extern void usage(char** ); -double clover_coeff = 1.0; +extern double clover_coeff; +extern bool compute_clover; namespace quda { extern void setTransferGPU(bool); @@ -171,6 +172,7 @@ void setMultigridParam(QudaMultigridParam &mg_param) { inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy; inv_param.clover_cuda_prec_precondition = cuda_prec_precondition; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; + inv_param.clover_coeff = clover_coeff; } inv_param.input_location = QUDA_CPU_FIELD_LOCATION; @@ -195,8 +197,6 @@ void setMultigridParam(QudaMultigridParam &mg_param) { } } - inv_param.clover_coeff = clover_coeff; - inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; @@ -415,7 +415,7 @@ int main(int argc, char **argv) size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); - void *gauge[4], *clover_inv=0;//, *clover=0; + void *gauge[4], *clover=0, *clover_inv=0; for (int dir = 0; dir < 4; dir++) { gauge[dir] = malloc(V*gaugeSiteSize*gSize); @@ -436,25 +436,15 @@ int main(int argc, char **argv) double norm = 0.1; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal - size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); + size_t cSize = inv_param.clover_cpu_prec; + clover = malloc(V*cloverSiteSize*cSize); clover_inv = malloc(V*cloverSiteSize*cSize); - construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec); - - // The uninverted clover term is only needed when solving the unpreconditioned - // system or when using "asymmetric" even/odd preconditioning. - int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE || - inv_param.solve_type == QUDA_NORMOP_PC_SOLVE); - int asymmetric = preconditioned && - (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || - inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC); - if (!preconditioned) { - //clover = clover_inv; - clover_inv = NULL; - } else if (asymmetric) { // fake it by using the same random matrix - //clover = clover_inv; // for both clover and clover_inv - } else { - //clover = NULL; - } + if (!compute_clover) construct_clover_field(clover, norm, diag, inv_param.clover_cpu_prec); + + inv_param.compute_clover = compute_clover; + if (compute_clover) inv_param.return_clover = 1; + inv_param.compute_clover_inverse = 1; + inv_param.return_clover_inverse = 1; } void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls); @@ -472,12 +462,10 @@ int main(int argc, char **argv) // load the gauge field loadGaugeQuda((void*)gauge, &gauge_param); - // load the clover term, if desired - //if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param); - // this line ensure that if we need to construct the clover inverse (in either the smoother or the solver) we do so if (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE || solve_type == QUDA_DIRECT_PC_SOLVE) inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(NULL, NULL, &inv_param); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param); + inv_param.solve_type = solve_type; // restore actual solve_type we want to do // setup the multigrid solver @@ -577,11 +565,14 @@ int main(int argc, char **argv) endQuda(); // finalize the communications layer -#if defined(QMP_COMMS) - QMP_finalize_msg_passing(); -#elif defined(MPI_COMMS) - MPI_Finalize(); -#endif + finalizeComms(); + + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + if (clover) free(clover); + if (clover_inv) free(clover_inv); + } + + for (int dir = 0; dir<4; dir++) free(gauge[dir]); return 0; } From 38fda505cc1aec25c9c351ad8a093a5c2fcf8f0b Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 20 Jun 2016 23:17:23 -0700 Subject: [PATCH 23/24] Added chiral-block-level accessor for clover::FloatNOrder to try to give the compiler an easier time with register allocation. Applied this to clover creation and inversion. --- include/clover_field_order.h | 97 +++++++++++++++++++++++------------- lib/clover_invert.cu | 37 ++++---------- lib/clover_quda.cu | 14 +++--- tests/test_util.cpp | 2 +- 4 files changed, 79 insertions(+), 71 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index b890d475e4..ceb08ad5c2 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -253,6 +253,8 @@ namespace quda { struct FloatNOrder { typedef typename mapper::type RegType; typedef typename VectorType::type Vector; + static const int M=length/(N*2); // number of short vectors per chiral block + static const int block=length/2; // chiral block size Float *clover; float *norm; @@ -282,59 +284,84 @@ namespace quda { bool Twisted() const {return twisted;} Float Mu2() const {return mu2;} - __device__ __host__ inline void load(RegType v[length], int x, int parity) const { - const int M = length/(N*2); + /** + @brief Load accessor for a single chiral block + @param[out] v Vector of loaded elements + @param[in] x Checkerboarded site index + @param[in] parity Field parity + @param[in] chirality Chiral block index + */ + __device__ __host__ inline void load(RegType v[block], int x, int parity, int chirality) const { #pragma unroll - for (int chirality=0; chirality<2; chirality++) { + for (int i=0; i(clover + parity*offset, x + stride*(chirality*M+i)); + // second do scalar copy converting into register type #pragma unroll - for (int i=0; i(clover + parity*offset, x + stride*(chirality*M+i)); - // second do scalar copy converting into register type - for (int j=0; j(&vecTmp)[j]); - } + for (int j=0; j(&vecTmp)[j]); + } - if (sizeof(Float)==sizeof(short)) + if (sizeof(Float)==sizeof(short)) #pragma unroll - for (int i=0; i scale[chi] ? fabs(v[chi*M+i]) : scale[chi]; - norm[parity*norm_offset + chi*stride + x] = scale[chi]; - } + for (int i=0; i scale ? fabs(v[i]) : scale; + norm[parity*norm_offset + chirality*stride + x] = scale; } - const int M=length/(N*2); - for (int chirality=0; chirality<2; chirality++) { - #pragma unroll - for (int i=0; i(&vecTmp)[j], v[(chirality*M+i)*N+j] / scale[chirality]); - else + for (int j=0; j(&vecTmp)[j], v[i*N+j] / scale); + else #pragma unroll - for (int j=0; j(&vecTmp)[j], v[(chirality*M+i)*N+j]); + for (int j=0; j(&vecTmp)[j], v[i*N+j]); - // second do vectorized copy into memory - reinterpret_cast(clover + parity*offset)[x + stride*(chirality*M+i)] = vecTmp; - } + // second do vectorized copy into memory + reinterpret_cast(clover + parity*offset)[x + stride*(chirality*M+i)] = vecTmp; } } + /** + @brief Load accessor for the clover matrix + @param[out] v Vector of loaded elements + @param[in] x Checkerboarded site index + @param[in] parity Field parity + @param[in] chirality Chiral block index + */ + __device__ __host__ inline void load(RegType v[length], int x, int parity) const { +#pragma unroll + for (int chirality=0; chirality<2; chirality++) load(&v[chirality*36], x, parity, chirality); + } + + /** + @brief Store accessor for the clover matrix + @param[out] v Vector of elements to be stored + @param[in] x Checkerboarded site index + @param[in] parity Field parity + @param[in] chirality Chiral block index + */ + __device__ __host__ inline void save(const RegType v[length], int x, int parity) { +#pragma unroll + for (int chirality=0; chirality<2; chirality++) save(&v[chirality*36], x, parity, chirality); + } + /** @brief Backup the field to the host when tuning */ diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu index 1bfca4dd79..0142c493af 100644 --- a/lib/clover_invert.cu +++ b/lib/clover_invert.cu @@ -32,13 +32,12 @@ namespace quda { template __device__ __host__ inline double cloverInvertCompute(CloverInvertArg arg, int x, int parity) { - Float A[72]; - double trlogA = 0.0; - - // load the clover term into memory - arg.clover.load(A, x, parity); + double trlogA = 0.0; for (int ch=0; ch<2; ch++) { + Float A[36]; + // load the clover term into memory + arg.clover.load(A, x, parity, ch); Float diag[6]; Float tmp[6]; // temporary storage @@ -51,6 +50,7 @@ namespace quda { for (int i=0; i<6; i++) diag[i] = two*A[ch*36+i]; const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14}; +#pragma unroll for (int i=0; i<15; i++) tri[idtab[i]] = complex(two*A[ch*36+6+2*i], two*A[ch*36+6+2*i+1]); //Compute (T^2 + mu2) first, then invert (not optimized!): @@ -60,38 +60,22 @@ namespace quda { //the global buffer): //complex* aux = (complex*)&A[ch*36]; //compute off-diagonal terms: -// aux[ 0] = tri[0]*diag[0]+diag[1]*tri[0]+conj(tri[2])*tri[1]+conj(tri[4])*tri[3]+conj(tri[7])*tri[6]+conj(tri[11])*tri[10]; -// aux[ 1] = tri[1]*diag[0]+diag[2]*tri[1]+tri[2]*tri[0]+conj(tri[5])*tri[3]+conj(tri[8])*tri[6]+conj(tri[12])*tri[10]; - aux[ 2] = tri[2]*diag[1]+diag[2]*tri[2]+tri[1]*conj(tri[0])+conj(tri[5])*tri[4]+conj(tri[8])*tri[7]+conj(tri[12])*tri[11]; -// aux[ 3] = tri[3]*diag[0]+diag[3]*tri[3]+tri[4]*tri[0]+tri[5]*tri[1]+conj(tri[9])*tri[6]+conj(tri[13])*tri[10]; - aux[ 4] = tri[4]*diag[1]+diag[3]*tri[4]+tri[3]*conj(tri[0])+tri[5]*tri[2]+conj(tri[9])*tri[7]+conj(tri[13])*tri[11]; - aux[ 5] = tri[5]*diag[2]+diag[3]*tri[5]+tri[3]*conj(tri[1])+tri[4]*conj(tri[2])+conj(tri[9])*tri[8]+conj(tri[13])*tri[12]; -// aux[ 6] = tri[6]*diag[0]+diag[4]*tri[6]+tri[7]*tri[0]+tri[8]*tri[1]+tri[9]*tri[3]+conj(tri[14])*tri[10]; - aux[ 7] = tri[7]*diag[1]+diag[4]*tri[7]+tri[6]*conj(tri[0])+tri[8]*tri[2]+tri[9]*tri[4]+conj(tri[14])*tri[11]; - aux[ 8] = tri[8]*diag[2]+diag[4]*tri[8]+tri[6]*conj(tri[1])+tri[7]*conj(tri[2])+tri[9]*tri[5]+conj(tri[14])*tri[12]; - aux[ 9] = tri[9]*diag[3]+diag[4]*tri[9]+tri[6]*conj(tri[3])+tri[7]*conj(tri[4])+tri[8]*conj(tri[5])+conj(tri[14])*tri[13]; -// aux[10] = tri[10]*diag[0]+diag[5]*tri[10]+tri[11]*tri[0]+tri[12]*tri[1]+tri[13]*tri[3]+tri[14]*tri[6]; - aux[11] = tri[11]*diag[1]+diag[5]*tri[11]+tri[10]*conj(tri[0])+tri[12]*tri[2]+tri[13]*tri[4]+tri[14]*tri[7]; - aux[12] = tri[12]*diag[2]+diag[5]*tri[12]+tri[10]*conj(tri[1])+tri[11]*conj(tri[2])+tri[13]*tri[5]+tri[14]*tri[8]; - aux[13] = tri[13]*diag[3]+diag[5]*tri[13]+tri[10]*conj(tri[3])+tri[11]*conj(tri[4])+tri[12]*conj(tri[5])+tri[14]*tri[9]; - aux[14] = tri[14]*diag[4]+diag[5]*tri[14]+tri[10]*conj(tri[6])+tri[11]*conj(tri[7])+tri[12]*conj(tri[8])+tri[13]*conj(tri[9]); - //update diagonal elements: diag[0] = (Float)arg.mu2+diag[0]*diag[0]+norm(tri[ 0])+norm(tri[ 1])+norm(tri[ 3])+norm(tri[ 6])+norm(tri[10]); diag[1] = (Float)arg.mu2+diag[1]*diag[1]+norm(tri[ 0])+norm(tri[ 2])+norm(tri[ 4])+norm(tri[ 7])+norm(tri[11]); @@ -164,13 +148,12 @@ namespace quda { constexpr Float half = static_cast(0.5); for (int i=0; i<6; i++) A[ch*36+i] = half * diag[i]; - for (int i=0; i<15; i++) { - A[ch*36+6+2*i] = half*tri[idtab[i]].real(); A[ch*36+6+2*i+1] = half*tri[idtab[i]].imag(); - } - } +#pragma unroll + for (int i=0; i<15; i++) { A[ch*36+6+2*i] = half*tri[idtab[i]].real(); A[ch*36+6+2*i+1] = half*tri[idtab[i]].imag(); } - // save the inverted matrix - arg.inverse.save(A, x, parity); + // save the inverted matrix + arg.inverse.save(A, x, parity, ch); + } return trlogA; } diff --git a/lib/clover_quda.cu b/lib/clover_quda.cu index 17fb161dec..4814d057f1 100644 --- a/lib/clover_quda.cu +++ b/lib/clover_quda.cu @@ -95,7 +95,6 @@ namespace quda { const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14}; Float diag[6]; Complex triangle[15]; - Float A[72]; // This uses lots of unnecessary memory for(int ch=0; ch<2; ++ch){ @@ -129,18 +128,17 @@ namespace quda { triangle[14] = block1[ch](2,1); - for(int i=0; i<6; ++i){ - A[ch*36 + i] = 0.5*diag[i]; - } + Float A[36]; + for(int i=0; i<6; ++i) A[i] = static_cast(0.5)*diag[i]; + for(int i=0; i<15; ++i){ - A[ch*36+6+2*i] = 0.5*triangle[idtab[i]].x; - A[ch*36+6+2*i + 1] = 0.5*triangle[idtab[i]].y; + A[6+2*i] = 0.5*triangle[idtab[i]].x; + A[6+2*i + 1] = 0.5*triangle[idtab[i]].y; } + arg.clover.save(A, idx, parity, ch); } // ch // 84 floating-point ops - - arg.clover.save(A, idx, parity); return; } diff --git a/tests/test_util.cpp b/tests/test_util.cpp index c484e7ec64..44982bef69 100644 --- a/tests/test_util.cpp +++ b/tests/test_util.cpp @@ -1590,7 +1590,7 @@ bool verify_results = true; double mass = 0.1; double mu = 0.1; double anisotropy = 1.0; -double clover_coeff = 1.0; +double clover_coeff = 0.1; bool compute_clover = false; double tol = 1e-7; double tol_hq = 0.; From 889939007528ec7d60fdbe2414e077b6becfb468 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 23 Jun 2016 13:56:09 -0700 Subject: [PATCH 24/24] Added missing guard around clover construction --- tests/dslash_test.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index 14ac029b8f..63638913e9 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -313,10 +313,12 @@ void init(int argc, char **argv) { spinor->Source(QUDA_RANDOM_SOURCE, 0); - double norm = 0.1; // clover components are random numbers in the range (-norm, norm) - double diag = 1.0; // constant added to the diagonal - construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); - memcpy(hostCloverInv, hostClover, V*cloverSiteSize*inv_param.clover_cpu_prec); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + double norm = 0.1; // clover components are random numbers in the range (-norm, norm) + double diag = 1.0; // constant added to the diagonal + construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); + memcpy(hostCloverInv, hostClover, V*cloverSiteSize*inv_param.clover_cpu_prec); + } printfQuda("done.\n"); fflush(stdout);