diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 3c5ccf194e..4801bdbf48 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -426,6 +426,12 @@ namespace quda */ ColorSpinorField &operator=(ColorSpinorField &&field); + /** + @brief Returns if the object is empty (not initialized) + @return true if the object has not been allocated, otherwise false + */ + bool empty() const { return !init; } + /** @brief Copy the source field contents into this @param[in] src Source from which we are copying diff --git a/include/gauge_field.h b/include/gauge_field.h index b1757087cb..948960b36f 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -476,6 +476,17 @@ namespace quda { return reinterpret_cast(gauge_array[d].data()); } + void *raw_pointer() const + { + if (is_pointer_array(order)) { + static void *data_array[8]; + for (int i = 0; i < site_dim; i++) data_array[i] = gauge_array[i].data(); + return data_array; + } else { + return gauge.data(); + } + } + /** @brief Return array of pointers to the per dimension gauge field allocation(s). @tparam T Optional type to cast the pointer to (default is diff --git a/include/quda_arch.h b/include/quda_arch.h index ed88fb0b8e..45a8ed34e4 100644 --- a/include/quda_arch.h +++ b/include/quda_arch.h @@ -14,5 +14,8 @@ #elif defined(QUDA_TARGET_SYCL) #include +#endif +#ifdef QUDA_OPENMP +#include #endif diff --git a/include/util_quda.h b/include/util_quda.h index 533df01970..3d68fb5a2e 100644 --- a/include/util_quda.h +++ b/include/util_quda.h @@ -66,7 +66,7 @@ char *getPrintBuffer(); number of OMP threads for CPU functions recorded in the tune cache. @return Returns the string */ -char* getOmpThreadStr(); +const char *getOmpThreadStr(); void errorQuda_(const char *func, const char *file, int line, ...); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index d02a26370d..6c18ccae43 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -470,6 +470,7 @@ endif() if(QUDA_OPENMP) target_link_libraries(quda PUBLIC OpenMP::OpenMP_CXX) + target_compile_definitions(quda PUBLIC QUDA_OPENMP) endif() # set which precisions to enable diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index e949080b79..46c190f433 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -75,7 +75,7 @@ namespace quda { if (&src != this) { // if field not already initialized then move the field - if (!init || are_compatible(*this, src)) { + if (!init || are_compatible(*this, src) || src.empty()) { if (init) destroy(); LatticeField::operator=(std::move(src)); move(std::move(src)); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 7117677300..c9576f6f01 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3219,11 +3219,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // the split topology. logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n"); if (!is_staggered) { - loadGaugeQuda(collected_gauge->data(), gauge_param); + loadGaugeQuda(collected_gauge->raw_pointer(), gauge_param); } else { - // freeGaugeQuda(); - loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->data(), - collected_milc_longlink_field->data()); + loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->raw_pointer(), + collected_milc_longlink_field->raw_pointer()); } logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n"); diff --git a/lib/util_quda.cpp b/lib/util_quda.cpp index 02a78834d0..93b9fb4627 100644 --- a/lib/util_quda.cpp +++ b/lib/util_quda.cpp @@ -134,17 +134,17 @@ void popOutputPrefix() char *getPrintBuffer() { return buffer_; } -char* getOmpThreadStr() { - static char omp_thread_string[128]; +const char *getOmpThreadStr() +{ + static std::string omp_thread_string; static bool init = false; if (!init) { - strcpy(omp_thread_string,"omp_threads="); - char *omp_threads = getenv("OMP_NUM_THREADS"); - strcat(omp_thread_string, omp_threads ? omp_threads : "1"); - strcat(omp_thread_string, ","); +#ifdef QUDA_OPENMP + omp_thread_string = std::string("omp_threads=" + std::to_string(omp_get_max_threads()) + ","); +#endif init = true; } - return omp_thread_string; + return omp_thread_string.c_str(); } void errorQuda_(const char *func, const char *file, int line, ...) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a1f25d4faf..3a95e355e8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -9,6 +9,15 @@ target_compile_options( $,-w,-Wall -Wextra> $<$:-Werror> ) + +# ignore any unkown pragmas if not using OpenMP +if(NOT ${QUDA_OPENMP}) + target_compile_options(quda_test PUBLIC $<$: + $<$:-Wno-unknown-pragmas> + $<$:-Wno-unknown-pragmas> + >) +endif() + if(BUILD_SHARED_LIBS) install(TARGETS quda_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() @@ -265,6 +274,7 @@ if(QUDA_MPI OR QUDA_QMP) if(DEFINED ENV{QUDA_TEST_GRID_SIZE}) get_test_ranks($ENV{QUDA_TEST_GRID_SIZE} QUDA_TEST_NUM_PROCS) endif() + message(STATUS "ctest will run on ${QUDA_TEST_NUM_PROCS} processes") set(QUDA_CTEST_LAUNCH ${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS} CACHE STRING "CTest Launcher command for QUDA's tests") endif() @@ -375,6 +385,18 @@ foreach(pol IN LISTS DSLASH_POLICIES) set_tests_properties(dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol}) endif() + add_test(NAME dslash_${DIRAC_NAME}_splitgrid_policy${pol2} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type ${DIRAC_NAME} + --all-partitions 0 + --test Dslash + --dim 2 4 6 8 + --gtest_output=xml:dslash_${DIRAC_NAME}_splitgrid_test_pol${pol2}.xml) + if(polenv) + set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol}) + endif() + set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) + add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dslash-type ${DIRAC_NAME} @@ -830,6 +852,17 @@ endif() set_tests_properties(dslash_${DIRAC_NAME}_mat_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2}) endif() + add_test(NAME dslash_${DIRAC_NAME}_splitgrid_policy${pol2} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type ${DIRAC_NAME} + --test MatPC + --dim 2 4 6 8 + --gtest_output=xml:dslash_${DIRAC_NAME}_matpc_test_pol${pol2}.xml) + if(polenv) + set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2}) + endif() + set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) + add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dslash-type ${DIRAC_NAME} @@ -931,6 +964,20 @@ foreach(prec IN LISTS TEST_PRECS) --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --enable-testing true --gtest_output=xml:invert_test_wilson_${prec}.xml) + + if(DEFINED ENV{QUDA_ENABLE_TUNING}) + if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) + add_test(NAME invert_test_splitgrid_wilson_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type wilson --ngcrkrylov 8 + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml) + + set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) + endif() + endif() endif() if(QUDA_DIRAC_TWISTED_MASS) diff --git a/tests/dslash_ctest.cpp b/tests/dslash_ctest.cpp index d75d1f3da9..631d640203 100644 --- a/tests/dslash_ctest.cpp +++ b/tests/dslash_ctest.cpp @@ -37,10 +37,6 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple 1); - if (::testing::get<2>(GetParam()) > 0 && test_split_grid) { return true; } - const std::array partition_enabled {true, true, true, false, true, false, false, false, true, false, false, false, true, false, true, true}; if (!ctest_all_partitions && !partition_enabled[::testing::get<2>(GetParam())]) return true; @@ -68,8 +64,6 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple(GetParam()); @@ -94,12 +88,20 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple vp_spinor; - std::vector vp_spinorOut; - std::vector vp_spinorRef; + static inline std::vector vp_spinor; + static inline std::vector vp_spinorOut; + static inline std::vector vp_spinorRef; // CUDA color spinor fields ColorSpinorField cudaSpinor; @@ -69,9 +69,9 @@ struct DslashTestWrapper { quda::DiracDomainWall4DPC *dirac_4dpc = nullptr; // Raw pointers - void *hostGauge[4] = {nullptr}; - void *hostClover = nullptr; - void *hostCloverInv = nullptr; + static inline void *hostGauge[4] = {nullptr}; + static inline void *hostClover = nullptr; + static inline void *hostCloverInv = nullptr; // Parameters QudaGaugeParam gauge_param; @@ -79,14 +79,12 @@ struct DslashTestWrapper { // Test options QudaParity parity = QUDA_EVEN_PARITY; - dslash_test_type dtest_type = dslash_test_type::Dslash; - bool test_split_grid = false; + static inline dslash_test_type dtest_type = dslash_test_type::Dslash; + static inline bool test_split_grid = false; int num_src = 1; const bool transfer = false; - DslashTestWrapper(dslash_test_type dtest) : dtest_type(dtest) { } - void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon) { cuda_prec = getPrecision(precision); @@ -108,7 +106,12 @@ struct DslashTestWrapper { inv_param.cuda_prec = cuda_prec; - init(argc, argv); + static bool first_time = true; + if (first_time) { + init_host(argc, argv); + first_time = false; + } + init(); } void init_test(int argc, char **argv) @@ -118,15 +121,16 @@ struct DslashTestWrapper { setWilsonGaugeParam(gauge_param); setInvertParam(inv_param); - init(argc, argv); + static bool first_time = true; + if (first_time) { + init_host(argc, argv); + first_time = false; + } + init(); } - void init(int argc, char **argv) + void init_host(int argc, char **argv) { - num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; - test_split_grid = num_src > 1; - if (test_split_grid) { dtest_type = dslash_test_type::Dslash; } - if (dslash_type == QUDA_ASQTAD_DSLASH || dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) { errorQuda("Asqtad not supported. Please try staggered_dslash_test instead"); } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH @@ -138,6 +142,13 @@ struct DslashTestWrapper { Ls = 1; } + if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match"); + + for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; + num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; + test_split_grid = num_src > 1; + if (test_split_grid) { dtest_type = dslash_test_type::Dslash; } + inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO; inv_param.solve_type = (dtest_type == dslash_test_type::Mat || dtest_type == dslash_test_type::MatDagMat) ? QUDA_DIRECT_SOLVE : @@ -178,8 +189,6 @@ struct DslashTestWrapper { } } - if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match"); - // construct input fields for (int dir = 0; dir < 4; dir++) hostGauge[dir] = safe_malloc((size_t)V * gauge_site_size * gauge_param.cpu_prec); @@ -187,8 +196,18 @@ struct DslashTestWrapper { || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { hostClover = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec); hostCloverInv = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec); + + if (compute_clover) + printfQuda("Computing clover field on GPU\n"); + else { + printfQuda("Sending clover field to GPU\n"); + constructHostCloverField(hostClover, hostCloverInv, inv_param); + } } + printfQuda("Randomizing fields... "); + constructHostGaugeField(hostGauge, gauge_param, argc, argv); + ColorSpinorParam csParam; csParam.nColor = 3; csParam.nSpin = 4; @@ -200,11 +219,7 @@ struct DslashTestWrapper { csParam.x[4] = Ls; } - if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { - csParam.pc_type = QUDA_5D_PC; - } else { - csParam.pc_type = QUDA_4D_PC; - } + csParam.pc_type = dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC; // ndeg_tm if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { @@ -236,11 +251,6 @@ struct DslashTestWrapper { spinor.Source(QUDA_RANDOM_SOURCE); - inv_param.split_grid[0] = grid_partition[0]; - inv_param.split_grid[1] = grid_partition[1]; - inv_param.split_grid[2] = grid_partition[2]; - inv_param.split_grid[3] = grid_partition[3]; - if (test_split_grid) { inv_param.num_src = num_src; inv_param.num_src_per_sub_partition = 1; @@ -251,26 +261,18 @@ struct DslashTestWrapper { std::fill(vp_spinor.begin(), vp_spinor.end(), spinor); } - csParam.x[0] = gauge_param.X[0]; - // set verbosity prior to loadGaugeQuda setVerbosity(verbosity); inv_param.verbosity = verbosity; + } - printfQuda("Randomizing fields... "); - constructHostGaugeField(hostGauge, gauge_param, argc, argv); - + void init() + { printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) { - if (compute_clover) - printfQuda("Computing clover field on GPU\n"); - else { - printfQuda("Sending clover field to GPU\n"); - constructHostCloverField(hostClover, hostCloverInv, inv_param); - } inv_param.compute_clover = compute_clover; inv_param.return_clover = compute_clover; inv_param.compute_clover_inverse = true; @@ -280,28 +282,16 @@ struct DslashTestWrapper { } if (!transfer) { + ColorSpinorParam csParam(spinor); csParam.location = QUDA_CUDA_FIELD_LOCATION; csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.setPrecision(inv_param.cuda_prec, inv_param.cuda_prec, true); - if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - csParam.siteSubset = QUDA_FULL_SITE_SUBSET; - } else { - csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; - csParam.x[0] /= 2; - } - printfQuda("Creating cudaSpinor with nParity = %d\n", csParam.siteSubset); cudaSpinor = ColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut with nParity = %d\n", csParam.siteSubset); cudaSpinorOut = ColorSpinorField(csParam); - if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - csParam.x[0] /= 2; - } - - csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; - printfQuda("Sending spinor field to GPU\n"); cudaSpinor = spinor; @@ -330,12 +320,28 @@ struct DslashTestWrapper { dirac = nullptr; } } + } + + static void destroy() + { + for (int dir = 0; dir < 4; dir++) + if (hostGauge[dir]) host_free(hostGauge[dir]); - for (int dir = 0; dir < 4; dir++) host_free(hostGauge[dir]); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) { - host_free(hostClover); - host_free(hostCloverInv); + if (hostClover) host_free(hostClover); + if (hostCloverInv) host_free(hostCloverInv); + } + + spinor = {}; + spinorOut = {}; + spinorRef = {}; + spinorTmp = {}; + + if (test_split_grid) { + vp_spinor.clear(); + vp_spinorOut.clear(); + vp_spinorRef.clear(); } } diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp index f15bcdbda5..370470fc50 100644 --- a/tests/gauge_path_test.cpp +++ b/tests/gauge_path_test.cpp @@ -204,10 +204,14 @@ void gauge_force_test(bool compute_force = true) void *refmom = Mom_ref_milc.data(); int *check_out = compute_force ? &force_check : &path_check; if (verify_results) { + quda::host_timer_t verify_timer; + verify_timer.start(); gauge_force_reference(refmom, eb3, U_qdp, input_path_buf, length, loop_coeff, num_paths, compute_force); *check_out = compare_floats(Mom_milc.data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec); if (compute_force) strong_check_mom(Mom_milc.data(), refmom, 4 * V, gauge_param.cpu_prec); + verify_timer.stop(); + printfQuda("Verification time = %.2f ms\n", verify_timer.last()); } if (compute_force) { @@ -317,6 +321,9 @@ void gauge_loop_test() std::vector traces_ref(num_paths); if (verify_results) { + quda::host_timer_t verify_timer; + verify_timer.start(); + gauge_loop_trace_reference(U_qdp, traces_ref, scale_factor, trace_path_p, trace_loop_length_p, trace_loop_coeff_p, num_paths); @@ -348,6 +355,9 @@ void gauge_loop_test() "Plaquette loop space %e time %e total %e ; plaqQuda space %e time %e total %e ; deviation %e\n", plaq_loop[0], plaq_loop[1], plaq_loop[2], obsParam.plaquette[0], obsParam.plaquette[1], obsParam.plaquette[2], plaq_deviation); + + verify_timer.stop(); + printfQuda("Verification time = %.2f ms\n", verify_timer.last()); } double perf = 1.0 * niter * flops * V / (host_timer.last() * 1e+9); diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp index cb6572b25b..98c2ae91d3 100644 --- a/tests/hisq_stencil_test.cpp +++ b/tests/hisq_stencil_test.cpp @@ -104,8 +104,10 @@ static void hisq_test() double u4 = u2 * u2; double u6 = u4 * u2; + std::array, 3> act_paths; + // First path: create V, W links - double act_path_coeff_1[6] = { + act_paths[0] = { (1.0 / 8.0), /* one link */ u2 * (0.0), /* Naik */ u2 * (-1.0 / 8.0) * 0.5, /* simple staple */ @@ -115,7 +117,7 @@ static void hisq_test() }; // Second path: create X, long links - double act_path_coeff_2[6] = { + act_paths[1] = { ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */ /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */ (-1.0 / 24.0), /* Naik */ @@ -126,7 +128,7 @@ static void hisq_test() }; // Paths for epsilon corrections. Not used if n_naiks = 1. - double act_path_coeff_3[6] = { + act_paths[2] = { (1.0 / 8.0), /* one link b/c of Naik */ (-1.0 / 24.0), /* Naik */ 0.0, /* simple staple */ @@ -185,7 +187,7 @@ static void hisq_test() // Tuning run... { printfQuda("Tuning...\n"); - computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_path_coeff_2, &qudaGaugeParam); + computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &qudaGaugeParam); } struct timeval t0, t1; @@ -196,11 +198,11 @@ static void hisq_test() // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying! // Create V links (fat7 links) and W links (unitarized V links), 1st path table set - computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_path_coeff_1, &qudaGaugeParam); + computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &qudaGaugeParam); if (n_naiks > 1) { // Create Naiks, 3rd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_3, &qudaGaugeParam); + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &qudaGaugeParam); // Rescale+copy Naiks into Naik field cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); @@ -211,7 +213,7 @@ static void hisq_test() } // Create X and long links, 2nd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_2, &qudaGaugeParam); + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &qudaGaugeParam); if (n_naiks > 1) { // Add into Naik field @@ -244,9 +246,6 @@ static void hisq_test() } if (verify_results) { - - double *act_paths[3] = {act_path_coeff_1, act_path_coeff_2, act_path_coeff_3}; - computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, sitelink, &qudaGaugeParam, act_paths, eps_naik); } diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp index c43a7ca07e..0f27fa9572 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -69,8 +69,16 @@ struct fcomplex { struct dcomplex { double real; double imag; + + void operator+=(const dcomplex &other) + { + real += other.real; + imag += other.imag; + } }; +#pragma omp declare reduction(dcomplex_sum:dcomplex : omp_out += omp_in) + struct fsu3_matrix { using real_t = float; using complex_t = fcomplex; @@ -322,9 +330,7 @@ int gf_neighborIndexFullLattice(size_t i, int dx[], const lattice_t &lat) template static su3_matrix compute_gauge_path(su3_matrix **sitelink, int i, int *path, int len, int dx[4], const lattice_t &lat) { - su3_matrix prev_matrix, curr_matrix; - - memset(&curr_matrix, 0, sizeof(curr_matrix)); + su3_matrix prev_matrix, curr_matrix = {}; curr_matrix.e[0][0].real = 1; curr_matrix.e[1][1].real = 1; @@ -366,16 +372,14 @@ template static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, int *path, int len, Float loop_coeff, int dir, const lattice_t &lat) { - su3_matrix curr_matrix, tmat; - int dx[4]; - +#pragma omp parallel for for (size_t i = 0; i < lat.volume; i++) { - memset(dx, 0, sizeof(dx)); - + int dx[4] = {}; dx[dir] = 1; - curr_matrix = compute_gauge_path(sitelink, i, path, len, dx, lat); + su3_matrix curr_matrix = compute_gauge_path(sitelink, i, path, len, dx, lat); + su3_matrix tmat; su3_adjoint(&curr_matrix, &tmat); scalar_mult_add_su3_matrix(staple + i, &tmat, loop_coeff, staple + i); } // i @@ -384,16 +388,14 @@ static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, int template static dcomplex compute_loop_trace(su3_matrix **sitelink, int *path, int len, double loop_coeff, const lattice_t &lat) { - su3_matrix tmat; - dcomplex accum; - memset(&accum, 0, sizeof(accum)); - int dx[4]; + dcomplex accum = {}; +#pragma omp parallel for reduction(dcomplex_sum : accum) for (size_t i = 0; i < lat.volume; i++) { - memset(dx, 0, sizeof(dx)); - tmat = compute_gauge_path(sitelink, i, path, len, dx, lat); + int dx[4] = {}; + su3_matrix tmat = compute_gauge_path(sitelink, i, path, len, dx, lat); auto tr = trace_su3(&tmat); - CSUM(accum, tr); + accum += dcomplex {tr.real, tr.imag}; } CSCALE(accum, loop_coeff); @@ -405,6 +407,7 @@ template static void update_mom(anti_hermitmat *momentum, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3, const lattice_t &lat) { +#pragma omp parallel for for (size_t i = 0; i < lat.volume; i++) { su3_matrix tmat1; su3_matrix tmat2; @@ -426,6 +429,7 @@ template static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3, const lattice_t &lat) { +#pragma omp parallel for for (size_t i = 0; i < lat.volume; i++) { su3_matrix tmat; diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 5179f38a66..5e85375bb3 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -99,8 +99,9 @@ template void su3_projector(su3_vecto template void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, size_t nhops) { - int dx[4]; +#pragma omp parallel for for (int i = 0; i < V; ++i) { + int dx[4]; for (int dir = 0; dir < 4; ++dir) { dx[3] = dx[2] = dx[1] = dx[0] = 0; dx[dir] = nhops; @@ -894,10 +895,12 @@ void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Rea // To keep the code as close to the GPU code as possible, we'll // loop over the even sites first and then the odd sites LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeMiddleLinkSite(site, dim, oprod, Qprev, link, sig, mu, coeff, ls, Pmu, P3, Qmu, newOprod); } // Loop over odd lattice sites +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeMiddleLinkSite(site, dim, oprod, Qprev, link, sig, mu, coeff, ls, Pmu, P3, Qmu, newOprod); } @@ -988,10 +991,12 @@ void computeSideLinkField(const int dim[4], const Real *const P3, #endif LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeSideLinkSite(site, dim, P3, Qprod, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod); } +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeSideLinkSite(site, dim, P3, Qprod, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod); } @@ -1098,6 +1103,7 @@ void computeAllLinkField(const int dim[4], const Real *const oprod, const Real * #endif LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeAllLinkSite(site, dim, oprod, Qprev, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod); @@ -1297,10 +1303,12 @@ void computeLongLinkField(const int dim[4], const Real *const oprod, const Real const int half_volume = volume / 2; LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < half_volume; ++site) { computeLongLinkSite(site, dim, oprod, link, sig, coeff, ls, output); } // Loop over odd lattice sites +#pragma omp parallel for for (int site = 0; site < half_volume; ++site) { computeLongLinkSite(site, dim, oprod, link, sig, coeff, ls, output); } @@ -1363,7 +1371,9 @@ void completeForceField(const int dim[4], const Real *const oprod, const Real *c const int half_volume = volume / 2; LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < half_volume; ++site) { completeForceSite(site, dim, oprod, link, sig, ls, mom); } +#pragma omp parallel for for (int site = 0; site < half_volume; ++site) { completeForceSite(site, dim, oprod, link, sig, ls, mom); } } diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 5716ce5de8..14852a9f22 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -42,6 +42,7 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, sFloat **, sFloat **, int oddBit, int daggerBit, QudaDslashType dslash_type) #endif { +#pragma omp parallel for for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0; gFloat *fatlinkEven[4], *fatlinkOdd[4]; @@ -66,6 +67,7 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, #endif } +#pragma omp parallel for for (int sid = 0; sid < Vh; sid++) { int offset = stag_spinor_site_size * sid; diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index 20af8bd390..eda5c778a7 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -20,6 +20,7 @@ QudaMultigridParam mg_param; QudaInvertParam mg_inv_param; QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL]; QudaEigParam eig_param; +bool use_split_grid = false; // if --enable-testing true is passed, we run the tests defined in here #include @@ -206,7 +207,7 @@ std::vector> solve(test_t param) // params corresponds to split grid for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; - bool use_split_grid = num_sub_partition > 1; + use_split_grid = num_sub_partition > 1; // Now QUDA is initialised and the fields are loaded, we may setup the preconditioner void *mg_preconditioner = nullptr; diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp index 3d16341bcb..74866dbcd7 100644 --- a/tests/invert_test_gtest.hpp +++ b/tests/invert_test_gtest.hpp @@ -117,6 +117,8 @@ bool skip_test(test_t param) return true; #endif } + // split-grid doesn't support split-grid at present + if (use_split_grid && multishift > 1) return true; return false; } diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index 8505b04fe6..78a735ad13 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -62,8 +62,6 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple(GetParam()); @@ -92,7 +90,11 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple= QUDA_HALF_PRECISION) + tol *= 10; // if recon 8, we tolerate a greater deviation ASSERT_LE(deviation, tol) << "Reference CPU and QUDA implementations do not agree"; } diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index f61e62d88c..0a3a063e45 100644 --- a/tests/staggered_dslash_test.cpp +++ b/tests/staggered_dslash_test.cpp @@ -22,8 +22,6 @@ class StaggeredDslashTest : public ::testing::Test } public: - StaggeredDslashTest() = default; - virtual void SetUp() { dslash_test_wrapper.init_test(argc_copy, argv_copy); @@ -37,7 +35,11 @@ class StaggeredDslashTest : public ::testing::Test // Per-test-case tear-down. // Called after the last test in this test case. // Can be omitted if not needed. - static void TearDownTestCase() { endQuda(); } + static void TearDownTestCase() + { + StaggeredDslashTestWrapper::destroy(); + endQuda(); + } }; TEST_F(StaggeredDslashTest, benchmark) { dslash_test_wrapper.run_test(niter, /**show_metrics =*/true); } diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 6c5d40dd30..a4aaac347b 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -45,7 +45,10 @@ struct DslashTime { struct StaggeredDslashTestWrapper { - void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr}; + // In the HISQ case, we include building fat/long links in this unit test + static inline void *qdp_fatlink_cpu[4] = {}; + static inline void *qdp_longlink_cpu[4] = {}; QudaGaugeParam gauge_param; QudaInvertParam inv_param; @@ -56,19 +59,16 @@ struct StaggeredDslashTestWrapper { GaugeField *cpuFat = nullptr; GaugeField *cpuLong = nullptr; - ColorSpinorField spinor; - ColorSpinorField spinorOut; - ColorSpinorField spinorRef; - ColorSpinorField tmpCpu; + static inline ColorSpinorField spinor; + static inline ColorSpinorField spinorOut; + static inline ColorSpinorField spinorRef; + static inline ColorSpinorField tmpCpu; ColorSpinorField cudaSpinor; ColorSpinorField cudaSpinorOut; - std::vector vp_spinor; - std::vector vp_spinor_out; + static inline std::vector vp_spinor; + static inline std::vector vp_spinor_out; - // In the HISQ case, we include building fat/long links in this unit test - void *qdp_fatlink_cpu[4] = {}; - void *qdp_longlink_cpu[4] = {}; void *ghost_fatlink_cpu[4] = {}; void *ghost_longlink_cpu[4] = {}; @@ -81,7 +81,7 @@ struct StaggeredDslashTestWrapper { char **argv_copy; // Split grid options - bool test_split_grid = false; + static inline bool test_split_grid = false; int num_src = 1; void staggeredDslashRef() @@ -133,7 +133,12 @@ struct StaggeredDslashTestWrapper { link_recon = link_recon_; - init(argc, argv); + static bool first_time = true; + if (first_time) { + init_host(argc, argv); + first_time = false; + } + init(); } void init_test(int argc, char **argv) @@ -144,52 +149,122 @@ struct StaggeredDslashTestWrapper { setStaggeredGaugeParam(gauge_param); setStaggeredInvertParam(inv_param); - init(argc, argv); + static bool first_time = true; + if (first_time) { + init_host(argc, argv); + first_time = false; + } + init(); } - void init(int argc, char **argv) + void init_host(int argc, char **argv) { - inv_param.split_grid[0] = grid_partition[0]; - inv_param.split_grid[1] = grid_partition[1]; - inv_param.split_grid[2] = grid_partition[2]; - inv_param.split_grid[3] = grid_partition[3]; + setDims(gauge_param.X); + dw_setDims(gauge_param.X, 1); + if (Nsrc != 1) { + warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc); + Nsrc = 1; + } + for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; test_split_grid = num_src > 1; if (test_split_grid) { dtest_type = dslash_test_type::Dslash; } - inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO; + for (int dir = 0; dir < 4; dir++) { + qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + } - setDims(gauge_param.X); - dw_setDims(gauge_param.X, 1); - if (Nsrc != 1) { - warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc); - Nsrc = 1; + bool compute_on_gpu = false; // reference fat/long fields should be computed on cpu + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, + compute_on_gpu); + + ColorSpinorParam csParam; + csParam.nColor = 3; + csParam.nSpin = 1; + csParam.nDim = 4; + for (int d = 0; d < 4; d++) { csParam.x[d] = gauge_param.X[d]; } + csParam.x[4] = 1; + + csParam.setPrecision(inv_param.cpu_prec); + // inv_param.solution_type = QUDA_MAT_SOLUTION; + csParam.pad = 0; + if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) { + csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; + csParam.x[0] /= 2; + inv_param.solution_type = QUDA_MATPC_SOLUTION; + } else { + csParam.siteSubset = QUDA_FULL_SITE_SUBSET; + inv_param.solution_type = QUDA_MAT_SOLUTION; } - // Allocate a lot of memory because I'm very confused - void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered + csParam.create = QUDA_ZERO_FIELD_CREATE; + csParam.pc_type = QUDA_4D_PC; + csParam.location = QUDA_CPU_FIELD_LOCATION; - milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + spinor = ColorSpinorField(csParam); + spinorOut = ColorSpinorField(csParam); + spinorRef = ColorSpinorField(csParam); + tmpCpu = ColorSpinorField(csParam); + + spinor.Source(QUDA_RANDOM_SOURCE); + + if (test_split_grid) { + inv_param.num_src = num_src; + inv_param.num_src_per_sub_partition = 1; + resize(vp_spinor, num_src, csParam); + resize(vp_spinor_out, num_src, csParam); + std::fill(vp_spinor.begin(), vp_spinor.end(), spinor); + } + + inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO; + // set verbosity prior to loadGaugeQuda + setVerbosity(verbosity); + } + + void init() + { + // Prepare the fields to be used for the GPU computation void *qdp_fatlink_gpu[4]; void *qdp_longlink_gpu[4]; - for (int dir = 0; dir < 4; dir++) { - qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_fatlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); qdp_longlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - - qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + } + // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you + // "compute" the fat/long links or not. + if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) { + for (int dir = 0; dir < 4; dir++) { + memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size); + } + } else { + // QUDA_ASQTAD_DSLASH + if (compute_fatlong) { + computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, host_gauge_data_type_size, + n_naiks, eps_naik); + } else { + // Not computing FatLong + for (int dir = 0; dir < 4; dir++) { + memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size); + } + } } - bool gauge_loaded = false; - constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu, - qdp_fatlink_gpu, gauge_param, argc, argv, gauge_loaded); + // Create ghost zones for CPU fields, + // prepare and load the GPU fields + void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + + milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // Alright, we've created all the void** links. // Create the void* pointers @@ -197,8 +272,6 @@ struct StaggeredDslashTestWrapper { reorderQDPtoMILC(milc_fatlink_cpu, qdp_fatlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); reorderQDPtoMILC(milc_longlink_gpu, qdp_longlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); reorderQDPtoMILC(milc_longlink_cpu, qdp_longlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - // Create ghost zones for CPU fields, - // prepare and load the GPU fields #ifdef MULTI_GPU gauge_param.type = (dslash_type == QUDA_ASQTAD_DSLASH) ? QUDA_ASQTAD_FAT_LINKS : QUDA_SU3_LINKS; @@ -227,9 +300,6 @@ struct StaggeredDslashTestWrapper { gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; } - // set verbosity prior to loadGaugeQuda - setVerbosity(verbosity); - printfQuda("Sending fat links to GPU\n"); loadGaugeQuda(milc_fatlink_gpu, &gauge_param); @@ -249,47 +319,7 @@ struct StaggeredDslashTestWrapper { loadGaugeQuda(milc_longlink_gpu, &gauge_param); } - ColorSpinorParam csParam; - csParam.nColor = 3; - csParam.nSpin = 1; - csParam.nDim = 4; - for (int d = 0; d < 4; d++) { csParam.x[d] = gauge_param.X[d]; } - csParam.x[4] = 1; - - csParam.setPrecision(inv_param.cpu_prec); - // inv_param.solution_type = QUDA_MAT_SOLUTION; - csParam.pad = 0; - if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) { - csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; - csParam.x[0] /= 2; - inv_param.solution_type = QUDA_MATPC_SOLUTION; - } else { - csParam.siteSubset = QUDA_FULL_SITE_SUBSET; - inv_param.solution_type = QUDA_MAT_SOLUTION; - } - - csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; - csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered - csParam.create = QUDA_ZERO_FIELD_CREATE; - csParam.pc_type = QUDA_4D_PC; - csParam.location = QUDA_CPU_FIELD_LOCATION; - - spinor = ColorSpinorField(csParam); - spinorOut = ColorSpinorField(csParam); - spinorRef = ColorSpinorField(csParam); - tmpCpu = ColorSpinorField(csParam); - - spinor.Source(QUDA_RANDOM_SOURCE); - - if (test_split_grid) { - inv_param.num_src = num_src; - inv_param.num_src_per_sub_partition = 1; - resize(vp_spinor, num_src, csParam); - resize(vp_spinor_out, num_src, csParam); - std::fill(vp_spinor.begin(), vp_spinor.end(), spinor); - } - + ColorSpinorParam csParam(spinor); csParam.fieldOrder = colorspinor::getNative(inv_param.cuda_prec, 1); csParam.pad = 0; csParam.setPrecision(inv_param.cuda_prec); @@ -305,28 +335,23 @@ struct StaggeredDslashTestWrapper { setDiracParam(diracParam, &inv_param, pc); dirac = Dirac::create(diracParam); - for (int dir = 0; dir < 4; dir++) { - host_free(qdp_fatlink_gpu[dir]); - host_free(qdp_longlink_gpu[dir]); - host_free(qdp_inlink[dir]); - } host_free(milc_fatlink_cpu); host_free(milc_longlink_cpu); - } - void end() - { for (int dir = 0; dir < 4; dir++) { - if (qdp_fatlink_cpu[dir] != nullptr) { - host_free(qdp_fatlink_cpu[dir]); - qdp_fatlink_cpu[dir] = nullptr; + if (qdp_fatlink_gpu[dir] != nullptr) { + host_free(qdp_fatlink_gpu[dir]); + qdp_fatlink_gpu[dir] = nullptr; } - if (qdp_longlink_cpu[dir] != nullptr) { - host_free(qdp_longlink_cpu[dir]); - qdp_longlink_cpu[dir] = nullptr; + if (qdp_longlink_gpu[dir] != nullptr) { + host_free(qdp_longlink_gpu[dir]); + qdp_longlink_gpu[dir] = nullptr; } } + } + void end() + { if (dirac != nullptr) { delete dirac; dirac = nullptr; @@ -350,6 +375,25 @@ struct StaggeredDslashTestWrapper { commDimPartitionedReset(); } + static void destroy() + { + for (int dir = 0; dir < 4; dir++) { + if (qdp_inlink[dir]) host_free(qdp_inlink[dir]); + if (qdp_fatlink_cpu[dir]) host_free(qdp_fatlink_cpu[dir]); + if (qdp_longlink_cpu[dir]) host_free(qdp_longlink_cpu[dir]); + } + + spinor = {}; + spinorOut = {}; + spinorRef = {}; + tmpCpu = {}; + + if (test_split_grid) { + vp_spinor.clear(); + vp_spinor_out.clear(); + } + } + DslashTime dslashCUDA(int niter) { DslashTime dslash_time; diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 70877d36d2..a495a11251 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -132,7 +132,7 @@ int main(int argc, char **argv) milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv); + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, true); // Compute plaquette. Routine is aware that the gauge fields already have the phases on them. double plaq[3]; diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 5d4a96540c..5fcd338b4b 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -181,7 +181,8 @@ void test(int argc, char **argv) void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)}; void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)}; void *qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)}; - constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv); + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, true); + // Reorder gauge fields to MILC order cpuFatMILC = cpuFatQDP; cpuLongMILC = cpuLongQDP; diff --git a/tests/utils/face_gauge.cpp b/tests/utils/face_gauge.cpp index 4b4af5ca62..bebaae5622 100644 --- a/tests/utils/face_gauge.cpp +++ b/tests/utils/face_gauge.cpp @@ -906,15 +906,10 @@ void do_exchange_cpu_staple(Float *staple, Float **ghost_staple, Float **staple_ Float *ghost_staple_back = ghost_staple[dir]; Float *ghost_staple_fwd = ghost_staple[dir] + 2 * Vsh[dir] * gauge_site_size; - MsgHandle *mh_recv_back; - MsgHandle *mh_recv_fwd; - MsgHandle *mh_send_fwd; - MsgHandle *mh_send_back; - - mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2 * len[dir]); - mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2 * len[dir]); - mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2 * len[dir]); - mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2 * len[dir]); + MsgHandle *mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2 * len[dir]); + MsgHandle *mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2 * len[dir]); + MsgHandle *mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2 * len[dir]); + MsgHandle *mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2 * len[dir]); comm_start(mh_recv_back); comm_start(mh_recv_fwd); diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 65ea318840..70aea9cdc2 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -1054,6 +1054,7 @@ template void constructUnitGaugeField(Float **res, QudaGaugePar } for (int dir = 0; dir < 4; dir++) { +#pragma omp parallel for for (int i = 0; i < Vh; i++) { for (int m = 0; m < 3; m++) { for (int n = 0; n < 3; n++) { @@ -1269,14 +1270,21 @@ template static void checkGauge(Float **oldG, Float **newG, dou for (int d = 0; d < 4; d++) { for (int eo = 0; eo < 2; eo++) { +#pragma omp parallel for for (int i = 0; i < Vh; i++) { int ga_idx = (eo * Vh + i); for (int j = 0; j < 18; j++) { double diff = fabs(newG[d][ga_idx * 18 + j] - oldG[d][ga_idx * 18 + j]); /// fabs(oldG[d][ga_idx*18+j]); for (int f = 0; f < fail_check; f++) - if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[d][f]++; - if (diff > epsilon || std::isnan(diff)) iter[d][j]++; + if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) { +#pragma omp atomic + fail[d][f]++; + } + if (diff > epsilon || std::isnan(diff)) { +#pragma omp atomic + iter[d][j]++; + } } } } @@ -1311,6 +1319,7 @@ void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase) } if (phase == SITELINK_PHASE_MILC) { +#pragma omp parallel for for (int i = 0; i < V; i++) { for (int dir = XUP; dir <= TUP; dir++) { int idx = i; @@ -1459,14 +1468,21 @@ template int compareLink(Float **linkA, Float **linkB, int len) for (int i = 0; i < 18; i++) iter[i] = 0; for (int dir = 0; dir < 4; dir++) { +#pragma omp parallel for for (int i = 0; i < len; i++) { for (int j = 0; j < 18; j++) { int is = i * 18 + j; double diff = fabs(linkA[dir][is] - linkB[dir][is]); for (int f = 0; f < fail_check; f++) - if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++; + if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) { +#pragma omp atomic + fail[f]++; + } // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff); - if (diff > 1e-3 || std::isnan(diff)) iter[j]++; + if (diff > 1e-3 || std::isnan(diff)) { +#pragma omp atomic + iter[j]++; + } } } } @@ -1624,14 +1640,21 @@ template int compare_mom(Float *momA, Float *momB, int len) int iter[mom_site_size]; for (auto i = 0lu; i < mom_site_size; i++) iter[i] = 0; +#pragma omp parallel for for (int i = 0; i < len; i++) { for (auto j = 0lu; j < mom_site_size - 1; j++) { int is = i * mom_site_size + j; double diff = fabs(momA[is] - momB[is]); for (int f = 0; f < fail_check; f++) - if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++; + if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) { +#pragma omp atomic + fail[f]++; + } // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff); - if (diff > 1e-3 || std::isnan(diff)) iter[j]++; + if (diff > 1e-3 || std::isnan(diff)) { +#pragma omp atomic + iter[j]++; + } } } diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 75e0d2ad1f..b788cc3081 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -49,15 +49,15 @@ void setQudaStaggeredInvTestParams(); //------------------------------------------------------ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, - int argc, char **argv, bool &gauge_loaded); + int argc, char **argv); void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, - QudaGaugeParam &gauge_param, int argc, char **argv); + QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu); void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *, QudaDslashType dslash_type); void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param); void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, void *act_path_coeff); void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink, - void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik); + void *qudaGaugeParamPtr, std::array, 3> &act_path_coeffs, double eps_naik); void computeTwoLinkCPU(void **twolink, void **sitelink, QudaGaugeParam *gauge_param); void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk, quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param, diff --git a/tests/utils/llfat_utils.cpp b/tests/utils/llfat_utils.cpp index 43d2acaffa..548c6976b1 100644 --- a/tests/utils/llfat_utils.cpp +++ b/tests/utils/llfat_utils.cpp @@ -29,10 +29,6 @@ template void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matrix *mulink, su3_matrix **sitelink, void **fatlink, Real coef, int use_staple) { - su3_matrix tmat1, tmat2; - int i; - su3_matrix *fat1; - /* Upper staple */ /* Computes the staple : * mu (B) @@ -46,16 +42,15 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr * It also adds the computed staple to the fatlink[mu] with weight coef. */ - int dx[4]; - /* upper staple */ - for (i = 0; i < V; i++) { +#pragma omp parallel for + for (int i = 0; i < V; i++) { - fat1 = ((su3_matrix *)fatlink[mu]) + i; + auto fat1 = ((su3_matrix *)fatlink[mu]) + i; su3_matrix *A = sitelink[nu] + i; - memset(dx, 0, sizeof(dx)); + int dx[4] = {}; dx[nu] = 1; int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); su3_matrix *B; @@ -70,6 +65,7 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); su3_matrix *C = sitelink[nu] + nbr_idx; + su3_matrix tmat1, tmat2; llfat_mult_su3_nn(A, B, &tmat1); if (staple != NULL) { /* Save the staple */ @@ -89,10 +85,11 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr * *********************************************/ - for (i = 0; i < V; i++) { +#pragma omp parallel for + for (int i = 0; i < V; i++) { - fat1 = ((su3_matrix *)fatlink[mu]) + i; - memset(dx, 0, sizeof(dx)); + auto fat1 = ((su3_matrix *)fatlink[mu]) + i; + int dx[4] = {}; dx[nu] = -1; int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); if (nbr_idx >= V || nbr_idx < 0) { @@ -113,6 +110,7 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2], dx[1], dx[0]); su3_matrix *C = sitelink[nu] + nbr_idx; + su3_matrix tmat1, tmat2; llfat_mult_su3_an(A, B, &tmat1); llfat_mult_su3_nn(&tmat1, C, &tmat2); @@ -148,6 +146,7 @@ void llfat_cpu(void **fatlink, su3_matrix **sitelink, Float *act_path_coeff) for (int dir = XUP; dir <= TUP; dir++) { // Intialize fat links with c_1*U_\mu(x) +#pragma omp parallel for for (int i = 0; i < V; i++) { su3_matrix *fat1 = ((su3_matrix *)fatlink[dir]) + i; llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1); @@ -210,10 +209,6 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m su3_matrix **ghost_mulink, su3_matrix **sitelink, su3_matrix **ghost_sitelink, su3_matrix **ghost_sitelink_diag, void **fatlink, Real coef, int use_staple) { - su3_matrix tmat1, tmat2; - int i; - su3_matrix *fat1; - int X1 = Z[0]; int X2 = Z[1]; int X3 = Z[2]; @@ -237,11 +232,10 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m * It also adds the computed staple to the fatlink[mu] with weight coef. */ - int dx[4]; - // upper staple - for (i = 0; i < V; i++) { +#pragma omp parallel for + for (int i = 0; i < V; i++) { int half_index = i; int oddBit = 0; @@ -264,10 +258,10 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m int space_con[4] = {(x4 * X3X2 + x3 * X2 + x2) / 2, (x4 * X3X1 + x3 * X1 + x1) / 2, (x4 * X2X1 + x2 * X1 + x1) / 2, (x3 * X2X1 + x2 * X1 + x1) / 2}; - fat1 = ((su3_matrix *)fatlink[mu]) + i; + auto fat1 = ((su3_matrix *)fatlink[mu]) + i; su3_matrix *A = sitelink[nu] + i; - memset(dx, 0, sizeof(dx)); + int dx[4] = {}; dx[nu] = 1; int nbr_idx; @@ -299,6 +293,7 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m C = sitelink[nu] + nbr_idx; } + su3_matrix tmat1, tmat2; llfat_mult_su3_nn(A, B, &tmat1); if (staple != NULL) { /* Save the staple */ @@ -318,7 +313,8 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m * *********************************************/ - for (i = 0; i < V; i++) { +#pragma omp parallel for + for (int i = 0; i < V; i++) { int half_index = i; int oddBit = 0; @@ -342,11 +338,11 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m // int x4 = x4_from_full_index(i); - fat1 = ((su3_matrix *)fatlink[mu]) + i; + auto fat1 = ((su3_matrix *)fatlink[mu]) + i; // we could be in the ghost link area if nu is T and we are at low T boundary su3_matrix *A; - memset(dx, 0, sizeof(dx)); + int dx[4] = {}; dx[nu] = -1; int nbr_idx; @@ -412,6 +408,7 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m } else { C = sitelink[nu] + nbr_idx; } + su3_matrix tmat1, tmat2; llfat_mult_su3_an(A, B, &tmat1); llfat_mult_su3_nn(&tmat1, C, &tmat2); @@ -430,12 +427,7 @@ template void llfat_cpu_mg(void **fatlink, su3_matrix **sitelink, su3_matrix **ghost_sitelink, su3_matrix **ghost_sitelink_diag, Float *act_path_coeff) { - QudaPrecision prec; - if (sizeof(Float) == 4) { - prec = QUDA_SINGLE_PRECISION; - } else { - prec = QUDA_DOUBLE_PRECISION; - } + QudaPrecision prec = sizeof(Float) == 4 ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION; su3_matrix *staple = (su3_matrix *)safe_malloc(V * sizeof(su3_matrix)); @@ -455,6 +447,7 @@ void llfat_cpu_mg(void **fatlink, su3_matrix **sitelink, su3_matrix **ghost_site for (int dir = XUP; dir <= TUP; dir++) { // Intialize fat links with c_1*U_\mu(x) +#pragma omp parallel for for (int i = 0; i < V; i++) { su3_matrix *fat1 = ((su3_matrix *)fatlink[dir]) + i; llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1); diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp index 2759e3489b..b020939c5f 100644 --- a/tests/utils/staggered_gauge_utils.cpp +++ b/tests/utils/staggered_gauge_utils.cpp @@ -23,8 +23,9 @@ static double max_allowed_error = 1e-11; // Wrap everything for the GPU construction of fat/long links here void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fatlink_eps, void **qdp_longlink_eps, - void **qdp_inlink, QudaGaugeParam &gauge_param_in, double **act_path_coeffs, double eps_naik, - size_t gSize, int n_naiks) + void **qdp_inlink, QudaGaugeParam &gauge_param_in, + std::array, 3> &act_path_coeffs, double eps_naik, size_t gSize, + int n_naiks) { // since a lot of intermediaries can be general matrices, override the recon in `gauge_param_in` auto gauge_param = gauge_param_in; @@ -52,11 +53,11 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat } // Create V links (fat7 links) and W links (unitarized V links), 1st path table set - computeKSLinkQuda(milc_vlink, nullptr, milc_wlink, milc_inlink, act_path_coeffs[0], &gauge_param); + computeKSLinkQuda(milc_vlink, nullptr, milc_wlink, milc_inlink, act_path_coeffs[0].data(), &gauge_param); if (n_naiks > 1) { // Create Naiks, 3rd path table set - computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[2], &gauge_param); + computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[2].data(), &gauge_param); // Rescale+copy Naiks into Naik field cpu_axy(gauge_param.cpu_prec, eps_naik, milc_fatlink, milc_fatlink_eps, V * 4 * gauge_site_size); @@ -67,7 +68,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat } // Create X and long links, 2nd path table set - computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[1], &gauge_param); + computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[1].data(), &gauge_param); if (n_naiks > 1) { // Add into Naik field @@ -98,7 +99,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat } } -void setActionPaths(double **act_paths) +template void setActionPaths(T &act_paths) { /////////////////////////// // Set path coefficients // @@ -160,8 +161,7 @@ void setActionPaths(double **act_paths) void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik) { - double **act_paths = new double *[3]; - for (int i = 0; i < 3; i++) act_paths[i] = new double[6]; + std::array, 3> act_paths; setActionPaths(act_paths); /////////////////////////////////////////////////////////////////////// @@ -196,17 +196,12 @@ void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlin host_free(qdp_longlink_naik_temp[dir]); } } - - for (int i = 0; i < 3; i++) delete[] act_paths[i]; - delete[] act_paths; } -void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, - void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, - int n_naiks, double eps_naik) +void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, + size_t gSize, int n_naiks, double eps_naik) { - double **act_paths = new double *[3]; - for (int i = 0; i < 3; i++) act_paths[i] = new double[6]; + std::array, 3> act_paths; setActionPaths(act_paths); /////////////////////////////////////////////////////////////////////// @@ -229,41 +224,26 @@ void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, vo ////////////////////////// // defined in "llfat_reference.cpp" - computeHISQLinksCPU(qdp_fatlink_cpu, qdp_longlink_cpu, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr, + computeHISQLinksCPU(qdp_fatlink, qdp_longlink, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr, (n_naiks == 2) ? qdp_longlink_naik_temp : nullptr, qdp_inlink, &gauge_param, act_paths, eps_naik); if (n_naiks == 2) { // Override the naik fields into the fat/long link fields for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink_cpu[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize); - memcpy(qdp_longlink_cpu[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize); - memset(qdp_fatlink_naik_temp[dir], 0, V * gauge_site_size * gSize); - memset(qdp_longlink_naik_temp[dir], 0, V * gauge_site_size * gSize); - } - } - - ////////////////////////// - // Create the GPU links // - ////////////////////////// - - // Skip eps field for now - // Note: GPU link creation only works for single and double precision - computeHISQLinksGPU(qdp_fatlink_gpu, qdp_longlink_gpu, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr, - (n_naiks == 2) ? qdp_longlink_naik_temp : nullptr, qdp_inlink, gauge_param, act_paths, eps_naik, - gSize, n_naiks); - - if (n_naiks == 2) { - // Override the naik fields into the fat/long link fields - for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink_gpu[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize); - memcpy(qdp_longlink_gpu[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize); + memcpy(qdp_fatlink[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize); + memcpy(qdp_longlink[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize); host_free(qdp_fatlink_naik_temp[dir]); host_free(qdp_longlink_naik_temp[dir]); } } +} - for (int i = 0; i < 3; i++) delete[] act_paths[i]; - delete[] act_paths; +void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, + void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, + int n_naiks, double eps_naik) +{ + computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik); + computeFatLongCPU(qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik); } // Routine that takes in a QDP-ordered field and outputs the plaquette. diff --git a/tests/utils/staggered_gauge_utils.h b/tests/utils/staggered_gauge_utils.h index f2cc4e4749..d969d437e1 100644 --- a/tests/utils/staggered_gauge_utils.h +++ b/tests/utils/staggered_gauge_utils.h @@ -18,6 +18,9 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik); +void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, + size_t gSize, int n_naiks, double eps_naik); + void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik); diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp index df1d787ff0..95efb0d85a 100644 --- a/tests/utils/staggered_host_utils.cpp +++ b/tests/utils/staggered_host_utils.cpp @@ -26,25 +26,24 @@ template using complex = std::complex; // Staggered gauge field utils //------------------------------------------------------ -void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu, - void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, - int argc, char **argv, bool &gauge_loaded) +void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, + QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu) { + gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; + // load a field WITHOUT PHASES if (latfile.size() > 0) { - if (!gauge_loaded) { - read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv); - if (dslash_type != QUDA_LAPLACE_DSLASH) { - applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); - } - gauge_loaded = true; - } // else it's already been loaded + // load in the command line supplied gauge field using QIO and LIME + read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv); + if (dslash_type != QUDA_LAPLACE_DSLASH) { + applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); + } } else { int construct_type = (unit_gauge) ? 0 : 1; if (dslash_type == QUDA_LAPLACE_DSLASH) { constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param); } else { - constructFatLongGaugeField(qdp_inlink, qdp_longlink_cpu, construct_type, gauge_param.cpu_prec, &gauge_param, + constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param, compute_fatlong ? QUDA_STAGGERED_DSLASH : dslash_type); } } @@ -53,62 +52,49 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli // "compute" the fat/long links or not. if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) { for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size); - memset(qdp_longlink_cpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size); } } else { // QUDA_ASQTAD_DSLASH if (compute_fatlong) { - computeFatLongGPUandCPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink, - gauge_param, host_gauge_data_type_size, n_naiks, eps_naik); + if (compute_on_gpu) + computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, + eps_naik); + else + computeFatLongCPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, + eps_naik); } else { - // Not computing FatLong for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); } } } } -void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, - QudaGaugeParam &gauge_param, int argc, char **argv) +void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu, + void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, + int argc, char **argv) { - gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; - - if (latfile.size() > 0) { - // load in the command line supplied gauge field using QIO and LIME - read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv); - if (dslash_type != QUDA_LAPLACE_DSLASH) { - applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); - } - } else { - int construct_type = (unit_gauge) ? 0 : 1; - if (dslash_type == QUDA_LAPLACE_DSLASH) { - constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param); - } else { - constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param, - compute_fatlong ? QUDA_STAGGERED_DSLASH : dslash_type); - } - } + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, false); // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you // "compute" the fat/long links or not. if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) { for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size); } } else { // QUDA_ASQTAD_DSLASH if (compute_fatlong) { - computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik); + computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, + eps_naik); } else { + // Not computing FatLong for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size); } } } @@ -188,6 +174,7 @@ void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaP // FIXME: may break host comparison if (dslash_type == QUDA_STAGGERED_DSLASH) { for (int dir = 0; dir < 4; ++dir) { +#pragma omp parallel for for (int i = 0; i < V; ++i) { for (auto j = 0lu; j < gauge_site_size; j += 2) { if (precision == QUDA_DOUBLE_PRECISION) { @@ -252,11 +239,12 @@ void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugePara template void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff) { - su3_matrix temp; for (int dir = XUP; dir <= TUP; ++dir) { - int dx[4] = {0, 0, 0, 0}; +#pragma omp parallel for for (int i = 0; i < V; ++i) { + int dx[4] = {0, 0, 0, 0}; // Initialize the longlinks + su3_matrix temp; su3_matrix *llink = ((su3_matrix *)longlink[dir]) + i; llfat_scalar_mult_su3_matrix(sitelink[dir] + i, act_path_coeff[1], llink); dx[dir] = 1; @@ -267,7 +255,6 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_ llfat_mult_su3_nn(&temp, sitelink[dir] + nbr_idx, llink); } } - return; } #else @@ -278,7 +265,7 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat for (int dir = 0; dir < 4; ++dir) E[dir] = Z[dir] + 4; const int extended_volume = E[3] * E[2] * E[1] * E[0]; - su3_matrix temp; +#pragma omp parallel for for (int t = 0; t < Z[3]; ++t) { for (int z = 0; z < Z[2]; ++z) { for (int y = 0; y < Z[1]; ++y) { @@ -294,6 +281,7 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat llfat_scalar_mult_su3_matrix(sitelinkEx[dir] + large_index, act_path_coeff[1], llink); dx[dir] = 1; int nbr_index = neighborIndexFullLattice(E, large_index, dx); + su3_matrix temp; llfat_mult_su3_nn(llink, sitelinkEx[dir] + nbr_index, &temp); dx[dir] = 2; nbr_index = neighborIndexFullLattice(E, large_index, dx); @@ -303,7 +291,6 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat } // y } // z } // t - return; } #endif @@ -399,6 +386,7 @@ void staggeredTwoLinkGaussianSmear(sFloat *res, gFloat **twolink, gFloat **ghost } { +#pragma omp parallel for for (int i = 0; i < Vh; i++) { // Get local time-slice index: const int local_t = i / Vsh_t; @@ -484,7 +472,7 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, const quda // If "eps_naik" is 0, there's no naik correction, // and this routine skips building the paths in "act_path_coeffs[2]" void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink, - void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik) + void *qudaGaugeParamPtr, std::array, 3> &act_path_coeffs, double eps_naik) { // Prepare various things QudaGaugeParam &qudaGaugeParam = *((QudaGaugeParam *)qudaGaugeParamPtr); @@ -517,6 +505,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo int X3 = Z[2]; int X4 = Z[3]; +#pragma omp parallel for for (int i = 0; i < V_ex; i++) { int sid = i; int oddBit = 0; @@ -656,6 +645,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo // Prepare for extended W fields // /////////////////////////////////// +#pragma omp parallel for for (int i = 0; i < V_ex; i++) { int sid = i; int oddBit = 0; @@ -872,6 +862,7 @@ void reorderQDPtoMILC(void *milc_out, void **qdp_in, int V, int siteSize, QudaPr template void reorderMILCtoQDP(Out **qdp_out, In *milc_in, int V, int siteSize) { +#pragma omp parallel for for (int i = 0; i < V; i++) { for (int dir = 0; dir < 4; dir++) { for (int j = 0; j < siteSize; j++) { @@ -926,6 +917,7 @@ void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, Q for (int d = 0; d < 3; d++) { // even +#pragma omp parallel for for (int i = 0; i < Vh; i++) { int index = fullLatticeIndex(i, 0); @@ -974,6 +966,7 @@ void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, Q // Apply boundary conditions to temporal links if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t()) { +#pragma omp parallel for for (int j = 0; j < Vh; j++) { int sign = 1; if (dslash_type == QUDA_ASQTAD_DSLASH) {