Merge branch 'develop' of github.com:lattice/quda into feature/mrhs-misc

lattice · Nov 22, 2024 · 1af8c52 · 1af8c52
2 parents 26c3e59 + 9fa8615
commit 1af8c52
Show file tree

Hide file tree

Showing 11 changed files with 72 additions and 49 deletions.
diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h
@@ -454,6 +454,8 @@ namespace quda
   */
   class TRLM3D : public EigenSolver
   {
+    bool verbose_rank = false; /** Whether this rank is one that logs */
+
   public:
     /**
        @brief Constructor for Thick Restarted Eigensolver class

diff --git a/include/kernels/blas_3d.cuh b/include/kernels/blas_3d.cuh
@@ -5,6 +5,8 @@
 #include <fast_intdiv.h>
 #include <quda_matrix.h>
 #include <matrix_field.h>
+#include <constant_kernel_arg.h>
+#include <reduce_helper.h>
 #include <kernel.h>
 
 namespace quda
@@ -118,7 +120,7 @@ namespace quda
     // Create a typename F for the ColorSpinorFields
     typedef typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load, disable_ghost>::type F;
 
-    static constexpr int MAX_ORTHO_DIM = 128;
+    static constexpr int MAX_ORTHO_DIM = 256;
     real a[MAX_ORTHO_DIM];
     const F x;
     real b[MAX_ORTHO_DIM];
@@ -173,7 +175,7 @@ namespace quda
     // Create a typename F for the ColorSpinorFields
     typedef typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load, disable_ghost>::type F;
 
-    static constexpr int MAX_ORTHO_DIM = 64;
+    static constexpr int MAX_ORTHO_DIM = 256;
     complex<real> a[MAX_ORTHO_DIM];
     const F x;
     complex<real> b[MAX_ORTHO_DIM];

diff --git a/lib/blas_3d.cu b/lib/blas_3d.cu
@@ -1,11 +1,9 @@
 #include <color_spinor_field.h>
-#include <contract_quda.h>
-
+#include <kernels/blas_3d.cuh>
 #include <tunable_nd.h>
 #include <tunable_reduction.h>
-#include <instantiate.h>
-#include <kernels/blas_3d.cuh>
 #include <blas_3d.h>
+#include <instantiate.h>
 
 namespace quda
 {

diff --git a/lib/eig_trlm_3d.cpp b/lib/eig_trlm_3d.cpp
@@ -29,6 +29,7 @@ namespace quda
     ortho_dim_size = eig_param->ortho_dim_size_local;
     if (ortho_dim != 3)
       errorQuda("Only 3D spatial splitting (ortho_dim = 3) is supported, ortho_dim passed = %d", ortho_dim);
+    verbose_rank = comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0;
 
     // Tridiagonal/Arrow matrices
     alpha_3D.resize(ortho_dim_size);
@@ -200,7 +201,7 @@ namespace quda
           if (num_keep_3D[t] < min_nkeep) min_nkeep = num_keep_3D[t];
 
           // Use printf to get data from t dim only
-          if (getVerbosity() >= QUDA_DEBUG_VERBOSE && comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0) {
+          if (getVerbosity() >= QUDA_DEBUG_VERBOSE && verbose_rank) {
             printf("%04d converged eigenvalues for timeslice %d at restart iter %04d\n", num_converged_3D[t],
                    t_offset + t, restart_iter + 1);
             printf("iter Conv[%d] = %d\n", t_offset + t, iter_converged_3D[t]);
@@ -226,7 +227,7 @@ namespace quda
         }
       }
 
-      if (getVerbosity() >= QUDA_VERBOSE && comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0) {
+      if (getVerbosity() >= QUDA_VERBOSE && verbose_rank) {
         std::stringstream converged;
         std::copy(converged_3D.begin(), converged_3D.end(), std::ostream_iterator<int>(converged, ""));
         printf("iter = %d rank = %d converged = %s min nlock %3d nconv %3d nkeep %3d\n", restart_iter + 1,
@@ -258,17 +259,21 @@ namespace quda
         computeEvals(kSpace, evals);
       }
     } else {
-      if (getVerbosity() >= QUDA_SUMMARIZE && comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0) {
-        printf("TRLM (rank = %d) computed the requested %d vectors in %d restart steps and %d OP*x operations\n",
-               comm_rank_global(), n_conv, restart_iter, iter);
-      }
+      if (verbose_rank) {
+        if (getVerbosity() >= QUDA_SUMMARIZE) {
+          printf("TRLM (rank = %d) computed the requested %d vectors in %d restart steps and %d OP*x operations\n",
+                 comm_rank_global(), n_conv, restart_iter, iter);
+        }
 
-      // Dump all Ritz values and residua if using Chebyshev
-      if (eig_param->use_poly_acc) {
-        for (int t = 0; t < ortho_dim_size; t++) {
-          for (int i = 0; i < n_conv; i++) {
-            logQuda(QUDA_VERBOSE, "RitzValue[%d][%04d]: (%+.16e, %+.16e) residual %.16e\n", t, i, alpha_3D[t][i], 0.0,
-                    residua_3D[t][i]);
+        if (getVerbosity() >= QUDA_VERBOSE) {
+          // Dump all Ritz values and residua if using Chebyshev
+          if (eig_param->use_poly_acc) {
+            for (int t = 0; t < ortho_dim_size; t++) {
+              for (int i = 0; i < n_conv; i++) {
+                printf("RitzValue[%d][%04d]: (%+.16e, %+.16e) residual %.16e\n", t, i, alpha_3D[t][i], 0.0,
+                       residua_3D[t][i]);
+              }
+            }
           }
         }
       }
@@ -600,7 +605,8 @@ namespace quda
 
     auto result = *std::max_element(inner_products.begin(), inner_products.end());
     comm_allreduce_max(result);
-    logQuda(QUDA_VERBOSE, "Chebyshev max %e\n", result);
+    if (verbose_rank && getVerbosity() >= QUDA_VERBOSE)
+      printf("Chebyshev max = %e (rank = %d)\n", result, comm_rank_global());
 
     // Increase final result by 10% for safety
     return result * 1.10;
@@ -642,7 +648,7 @@ namespace quda
     if (size == n_conv) {
       evals.resize(ortho_dim_size * comm_dim_global(ortho_dim) * n_conv, 0.0);
 
-      if (comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0) {
+      if (verbose_rank) {
         int t_offset = ortho_dim_size * comm_coord_global(3);
         for (int t = 0; t < ortho_dim_size; t++) {
           for (int i = 0; i < size; i++) {

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
@@ -2403,10 +2403,9 @@ void checkClover(QudaInvertParam *param) {
   if (cloverEigensolver == nullptr) errorQuda("Eigensolver clover field doesn't exist");
 }
 
-quda::GaugeField *checkGauge(QudaInvertParam *param, bool use_smeared_gauge = false)
+quda::GaugeField *checkGauge(QudaInvertParam *param)
 {
   quda::GaugeField *U = param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise :
-    use_smeared_gauge                                            ? gaugeSmeared :
                                                                    gaugePrecise;
 
   if (U == nullptr)
@@ -2416,7 +2415,7 @@ quda::GaugeField *checkGauge(QudaInvertParam *param, bool use_smeared_gauge = fa
     errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, U->Precision());
   }
 
-  if (param->dslash_type != QUDA_ASQTAD_DSLASH && !use_smeared_gauge) {
+  if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
     if (param->cuda_prec_sloppy != gaugeSloppy->Precision()
         || param->cuda_prec_precondition != gaugePrecondition->Precision()
         || param->cuda_prec_refinement_sloppy != gaugeRefinement->Precision()
@@ -2434,7 +2433,7 @@ quda::GaugeField *checkGauge(QudaInvertParam *param, bool use_smeared_gauge = fa
     if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist");
     if (gaugeEigensolver == nullptr) errorQuda("Refinement gauge field doesn't exist");
     if (param->overlap && gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist");
-  } else if (!use_smeared_gauge) {
+  } else {
     if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist");
 
     if (param->cuda_prec_sloppy != gaugeFatSloppy->Precision()
@@ -2565,7 +2564,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   checkEigParam(eig_param);
 
   // Check that the gauge field is valid
-  GaugeField *cudaGauge = checkGauge(inv_param, eig_param->use_smeared_gauge);
+  GaugeField *cudaGauge = checkGauge(inv_param);
 
   // Set iter statistics to zero
   inv_param->iter = 0;

diff --git a/lib/targets/hip/target_hip.cmake b/lib/targets/hip/target_hip.cmake
@@ -54,9 +54,15 @@ set(CMAKE_HIP_FLAGS_RELEASE
 set(CMAKE_HIP_FLAGS_HOSTDEBUG
     "-g"
     CACHE STRING "Flags used by the C++ compiler during host-debug builds.")
-set(CMAKE_HIP_FLAGS_DEBUG
-    "-g -G"
-    CACHE STRING "Flags used by the C++ compiler during full (host+device) debug builds.")
+if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
+    set(CMAKE_HIP_FLAGS_DEBUG
+        "-g -G"
+        CACHE STRING "Flags used by the HIP compiler during debug builds (with device debug support for NVIDIA).")
+else()
+    set(CMAKE_HIP_FLAGS_DEBUG
+        "-g"
+        CACHE STRING "Flags used by the HIP compiler during debug builds (with device debug support for Clang).")
+endif()
 set(CMAKE_HIP_FLAGS_SANITIZE
     "-g "
     CACHE STRING "Flags used by the C++ compiler during sanitizer debug builds.")

diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
@@ -859,10 +859,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std:
     else
       sol_type = QUDA_MATDAG_MAT_SOLUTION;
   } else {
-    if (use_pc)
-      sol_type = QUDA_MATPC_SOLUTION;
-    else
-      sol_type = QUDA_MAT_SOLUTION;
+    sol_type = use_pc ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION;
   }
 
   // Create temporary spinors
@@ -884,30 +881,38 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std:
   }
 
   if (laplace3D == 3) {
-    int t_offset = spinor.X()[3] * comm_coord(3);
-    std::vector<double> nrm2(spinor.X()[3], 0.0);
-    std::vector<double> src2(spinor.X()[3], 0.0);
+    auto batch_size = (spinor.VolumeCB() / spinor.X()[3]) * stag_spinor_site_size;
+    auto t_offset = spinor.X()[3] * comm_coord(3);
+    std::vector<double> nrm2(spinor.X()[3] * comm_dim(3), 0.0);
+    std::vector<double> src2(spinor.X()[3] * comm_dim(3), 0.0);
     // Compute M * x - \lambda * x on each slice
     for (auto t = 0; t < spinor.X()[3]; t++) {
       auto t_global = t_offset + t;
-      auto batch_size = (spinor.VolumeCB() / spinor.X()[3]) * stag_spinor_site_size;
       auto offset = t * batch_size * inv_param.cpu_prec;
 
       for (int parity = 0; parity < spinor.SiteSubset(); parity++) {
         caxpy(-lambda[t_global], static_cast<char *>(spinor.data()) + offset, static_cast<char *>(ref.data()) + offset,
               batch_size, inv_param.cpu_prec);
 
-        nrm2[t] += norm_2(static_cast<char *>(ref.data()) + offset, batch_size, inv_param.cpu_prec, false);
-        src2[t] += norm_2(static_cast<char *>(spinor.data()) + offset, batch_size, inv_param.cpu_prec, false);
+        nrm2[t_global] += norm_2(static_cast<char *>(ref.data()) + offset, batch_size, inv_param.cpu_prec, false);
+        src2[t_global] += norm_2(static_cast<char *>(spinor.data()) + offset, batch_size, inv_param.cpu_prec, false);
 
         offset += spinor.VolumeCB() * stag_spinor_site_size * inv_param.cpu_prec;
       }
+    }
 
-      auto l = ((double *)&(lambda[t_global]))[0];
-      printfQuda("Eigenvector %4d, t = %d lambda = %15.14e: tol %.2e, host residual = %.15e\n", i, t_global, l,
-                 eig_param.tol, sqrt(nrm2[t] / src2[t]));
+    comm_allreduce_sum(nrm2);
+    comm_allreduce_sum(src2);
+
+    for (auto t = 0; t < spinor.X()[3] * comm_dim(3); t++) {
+      auto l = ((double *)&(lambda[t]))[0];
+      printfQuda("Eigenvector %4d, t = %d lambda = %15.14e: tol %.2e, host residual = %.15e\n", i, t, l, eig_param.tol,
+                 sqrt(nrm2[t] / src2[t]));
     }
-    return sqrt(nrm2[0] / src2[0]);
+    auto total_nrm2 = std::accumulate(nrm2.begin(), nrm2.end(), 0);
+    auto total_src2 = std::accumulate(src2.begin(), src2.end(), 0);
+
+    return sqrt(total_nrm2 / total_src2);
   } else {
     // Compute M * x - \lambda * x
     caxpy(-lambda[0], spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec);

diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
@@ -179,7 +179,8 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel
   stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type, laplace3D);
 
   if (dslash_type == QUDA_LAPLACE_DSLASH) {
-    double kappa = 1.0 / ((laplace3D < 4 ? 6 : 8) + mass);
+    int dimension = laplace3D < 4 ? 3 : 4;
+    double kappa = 1.0 / (2 * dimension + mass);
     xpay(in.data(), kappa, out.data(), out.Length(), out.Precision());
   } else {
     axpy(2 * mass, in.data(), out.data(), out.Length(), out.Precision());

diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
@@ -166,9 +166,10 @@ std::vector<double> eigensolve(test_t test_param)
   eig_param.use_smeared_gauge = gauge_smear;
 
   if (dslash_type == QUDA_LAPLACE_DSLASH) {
-    eig_inv_param.kappa = 1.0 / ((laplace3D == 3 ? 6 : 8) + mass);
+    int dimension = laplace3D < 4 ? 3 : 4;
+    eig_inv_param.kappa = 1.0 / (2 * dimension + mass);
     eig_inv_param.laplace3D = laplace3D;
-    if (laplace3D == 3) {
+    if (dimension == 3) {
       eig_param.ortho_dim = laplace3D;
       eig_param.ortho_dim_size_local = tdim;
     }

diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
@@ -510,7 +510,8 @@ int main(int argc, char **argv)
     if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
       errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   } else {
-    if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON");
+    if (is_laplace(dslash_type))
+      errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON");
     if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
   }
 

diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp
@@ -129,16 +129,17 @@ void setInvertParam(QudaInvertParam &inv_param)
 {
   // Set dslash type
   inv_param.dslash_type = dslash_type;
+  int dimension = laplace3D < 4 ? 3 : 4;
 
   // Use kappa or mass normalisation
   if (kappa == -1.0) {
     inv_param.mass = mass;
     inv_param.kappa = 1.0 / (2.0 * (1 + 3 / anisotropy + mass));
-    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / ((laplace3D == 3 ? 6 : 8) + mass);
+    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (2 * dimension + mass);
   } else {
     inv_param.kappa = kappa;
     inv_param.mass = 0.5 / kappa - (1.0 + 3.0 / anisotropy);
-    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - (laplace3D == 3 ? 6 : 8);
+    if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 2 * dimension;
   }
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
     printfQuda("Kappa = %.8f Mass = %.8f\n", inv_param.kappa, inv_param.mass);
@@ -939,7 +940,8 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param)
   // Solver params
   inv_param.verbosity = verbosity;
   inv_param.mass = mass;
-  inv_param.kappa = 1.0 / ((laplace3D == 3 ? 6 : 8) + mass); // for Laplace operator
+  int dimension = laplace3D < 4 ? 3 : 4;
+  inv_param.kappa = 1.0 / (2 * dimension + mass); // for Laplace operator
   inv_param.laplace3D = laplace3D;              // for Laplace operator
 
   if (Nsrc < Nsrc_tile || Nsrc % Nsrc_tile != 0)
-Original file line number
+Diff line change
@@ Expand Up / @@ -454,6 +454,8 @@ namespace quda @@
       */
       class TRLM3D : public EigenSolver
       {
+        bool verbose_rank = false; /** Whether this rank is one that logs */
       public:
         /**
            @brief Constructor for Thick Restarted Eigensolver class
@@ Expand Down @@