Skip to content

Commit

Permalink
Merge branch 'develop' of github.com:lattice/quda into feature/mrhs-misc
Browse files Browse the repository at this point in the history
  • Loading branch information
maddyscientist committed Nov 22, 2024
2 parents 26c3e59 + 9fa8615 commit 1af8c52
Show file tree
Hide file tree
Showing 11 changed files with 72 additions and 49 deletions.
2 changes: 2 additions & 0 deletions include/eigensolve_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,8 @@ namespace quda
*/
class TRLM3D : public EigenSolver
{
bool verbose_rank = false; /** Whether this rank is one that logs */

public:
/**
@brief Constructor for Thick Restarted Eigensolver class
Expand Down
6 changes: 4 additions & 2 deletions include/kernels/blas_3d.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <fast_intdiv.h>
#include <quda_matrix.h>
#include <matrix_field.h>
#include <constant_kernel_arg.h>
#include <reduce_helper.h>
#include <kernel.h>

namespace quda
Expand Down Expand Up @@ -118,7 +120,7 @@ namespace quda
// Create a typename F for the ColorSpinorFields
typedef typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load, disable_ghost>::type F;

static constexpr int MAX_ORTHO_DIM = 128;
static constexpr int MAX_ORTHO_DIM = 256;
real a[MAX_ORTHO_DIM];
const F x;
real b[MAX_ORTHO_DIM];
Expand Down Expand Up @@ -173,7 +175,7 @@ namespace quda
// Create a typename F for the ColorSpinorFields
typedef typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load, disable_ghost>::type F;

static constexpr int MAX_ORTHO_DIM = 64;
static constexpr int MAX_ORTHO_DIM = 256;
complex<real> a[MAX_ORTHO_DIM];
const F x;
complex<real> b[MAX_ORTHO_DIM];
Expand Down
6 changes: 2 additions & 4 deletions lib/blas_3d.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
#include <color_spinor_field.h>
#include <contract_quda.h>

#include <kernels/blas_3d.cuh>
#include <tunable_nd.h>
#include <tunable_reduction.h>
#include <instantiate.h>
#include <kernels/blas_3d.cuh>
#include <blas_3d.h>
#include <instantiate.h>

namespace quda
{
Expand Down
34 changes: 20 additions & 14 deletions lib/eig_trlm_3d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ namespace quda
ortho_dim_size = eig_param->ortho_dim_size_local;
if (ortho_dim != 3)
errorQuda("Only 3D spatial splitting (ortho_dim = 3) is supported, ortho_dim passed = %d", ortho_dim);
verbose_rank = comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0;

// Tridiagonal/Arrow matrices
alpha_3D.resize(ortho_dim_size);
Expand Down Expand Up @@ -200,7 +201,7 @@ namespace quda
if (num_keep_3D[t] < min_nkeep) min_nkeep = num_keep_3D[t];

// Use printf to get data from t dim only
if (getVerbosity() >= QUDA_DEBUG_VERBOSE && comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0) {
if (getVerbosity() >= QUDA_DEBUG_VERBOSE && verbose_rank) {
printf("%04d converged eigenvalues for timeslice %d at restart iter %04d\n", num_converged_3D[t],
t_offset + t, restart_iter + 1);
printf("iter Conv[%d] = %d\n", t_offset + t, iter_converged_3D[t]);
Expand All @@ -226,7 +227,7 @@ namespace quda
}
}

if (getVerbosity() >= QUDA_VERBOSE && comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0) {
if (getVerbosity() >= QUDA_VERBOSE && verbose_rank) {
std::stringstream converged;
std::copy(converged_3D.begin(), converged_3D.end(), std::ostream_iterator<int>(converged, ""));
printf("iter = %d rank = %d converged = %s min nlock %3d nconv %3d nkeep %3d\n", restart_iter + 1,
Expand Down Expand Up @@ -258,17 +259,21 @@ namespace quda
computeEvals(kSpace, evals);
}
} else {
if (getVerbosity() >= QUDA_SUMMARIZE && comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0) {
printf("TRLM (rank = %d) computed the requested %d vectors in %d restart steps and %d OP*x operations\n",
comm_rank_global(), n_conv, restart_iter, iter);
}
if (verbose_rank) {
if (getVerbosity() >= QUDA_SUMMARIZE) {
printf("TRLM (rank = %d) computed the requested %d vectors in %d restart steps and %d OP*x operations\n",
comm_rank_global(), n_conv, restart_iter, iter);
}

// Dump all Ritz values and residua if using Chebyshev
if (eig_param->use_poly_acc) {
for (int t = 0; t < ortho_dim_size; t++) {
for (int i = 0; i < n_conv; i++) {
logQuda(QUDA_VERBOSE, "RitzValue[%d][%04d]: (%+.16e, %+.16e) residual %.16e\n", t, i, alpha_3D[t][i], 0.0,
residua_3D[t][i]);
if (getVerbosity() >= QUDA_VERBOSE) {
// Dump all Ritz values and residua if using Chebyshev
if (eig_param->use_poly_acc) {
for (int t = 0; t < ortho_dim_size; t++) {
for (int i = 0; i < n_conv; i++) {
printf("RitzValue[%d][%04d]: (%+.16e, %+.16e) residual %.16e\n", t, i, alpha_3D[t][i], 0.0,
residua_3D[t][i]);
}
}
}
}
}
Expand Down Expand Up @@ -600,7 +605,8 @@ namespace quda

auto result = *std::max_element(inner_products.begin(), inner_products.end());
comm_allreduce_max(result);
logQuda(QUDA_VERBOSE, "Chebyshev max %e\n", result);
if (verbose_rank && getVerbosity() >= QUDA_VERBOSE)
printf("Chebyshev max = %e (rank = %d)\n", result, comm_rank_global());

// Increase final result by 10% for safety
return result * 1.10;
Expand Down Expand Up @@ -642,7 +648,7 @@ namespace quda
if (size == n_conv) {
evals.resize(ortho_dim_size * comm_dim_global(ortho_dim) * n_conv, 0.0);

if (comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0) {
if (verbose_rank) {
int t_offset = ortho_dim_size * comm_coord_global(3);
for (int t = 0; t < ortho_dim_size; t++) {
for (int i = 0; i < size; i++) {
Expand Down
9 changes: 4 additions & 5 deletions lib/interface_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2403,10 +2403,9 @@ void checkClover(QudaInvertParam *param) {
if (cloverEigensolver == nullptr) errorQuda("Eigensolver clover field doesn't exist");
}

quda::GaugeField *checkGauge(QudaInvertParam *param, bool use_smeared_gauge = false)
quda::GaugeField *checkGauge(QudaInvertParam *param)
{
quda::GaugeField *U = param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise :
use_smeared_gauge ? gaugeSmeared :
gaugePrecise;

if (U == nullptr)
Expand All @@ -2416,7 +2415,7 @@ quda::GaugeField *checkGauge(QudaInvertParam *param, bool use_smeared_gauge = fa
errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, U->Precision());
}

if (param->dslash_type != QUDA_ASQTAD_DSLASH && !use_smeared_gauge) {
if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
if (param->cuda_prec_sloppy != gaugeSloppy->Precision()
|| param->cuda_prec_precondition != gaugePrecondition->Precision()
|| param->cuda_prec_refinement_sloppy != gaugeRefinement->Precision()
Expand All @@ -2434,7 +2433,7 @@ quda::GaugeField *checkGauge(QudaInvertParam *param, bool use_smeared_gauge = fa
if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist");
if (gaugeEigensolver == nullptr) errorQuda("Refinement gauge field doesn't exist");
if (param->overlap && gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist");
} else if (!use_smeared_gauge) {
} else {
if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist");

if (param->cuda_prec_sloppy != gaugeFatSloppy->Precision()
Expand Down Expand Up @@ -2565,7 +2564,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
checkEigParam(eig_param);

// Check that the gauge field is valid
GaugeField *cudaGauge = checkGauge(inv_param, eig_param->use_smeared_gauge);
GaugeField *cudaGauge = checkGauge(inv_param);

// Set iter statistics to zero
inv_param->iter = 0;
Expand Down
12 changes: 9 additions & 3 deletions lib/targets/hip/target_hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,15 @@ set(CMAKE_HIP_FLAGS_RELEASE
set(CMAKE_HIP_FLAGS_HOSTDEBUG
"-g"
CACHE STRING "Flags used by the C++ compiler during host-debug builds.")
set(CMAKE_HIP_FLAGS_DEBUG
"-g -G"
CACHE STRING "Flags used by the C++ compiler during full (host+device) debug builds.")
if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
set(CMAKE_HIP_FLAGS_DEBUG
"-g -G"
CACHE STRING "Flags used by the HIP compiler during debug builds (with device debug support for NVIDIA).")
else()
set(CMAKE_HIP_FLAGS_DEBUG
"-g"
CACHE STRING "Flags used by the HIP compiler during debug builds (with device debug support for Clang).")
endif()
set(CMAKE_HIP_FLAGS_SANITIZE
"-g "
CACHE STRING "Flags used by the C++ compiler during sanitizer debug builds.")
Expand Down
33 changes: 19 additions & 14 deletions tests/host_reference/dslash_reference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -859,10 +859,7 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std:
else
sol_type = QUDA_MATDAG_MAT_SOLUTION;
} else {
if (use_pc)
sol_type = QUDA_MATPC_SOLUTION;
else
sol_type = QUDA_MAT_SOLUTION;
sol_type = use_pc ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION;
}

// Create temporary spinors
Expand All @@ -884,30 +881,38 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std:
}

if (laplace3D == 3) {
int t_offset = spinor.X()[3] * comm_coord(3);
std::vector<double> nrm2(spinor.X()[3], 0.0);
std::vector<double> src2(spinor.X()[3], 0.0);
auto batch_size = (spinor.VolumeCB() / spinor.X()[3]) * stag_spinor_site_size;
auto t_offset = spinor.X()[3] * comm_coord(3);
std::vector<double> nrm2(spinor.X()[3] * comm_dim(3), 0.0);
std::vector<double> src2(spinor.X()[3] * comm_dim(3), 0.0);
// Compute M * x - \lambda * x on each slice
for (auto t = 0; t < spinor.X()[3]; t++) {
auto t_global = t_offset + t;
auto batch_size = (spinor.VolumeCB() / spinor.X()[3]) * stag_spinor_site_size;
auto offset = t * batch_size * inv_param.cpu_prec;

for (int parity = 0; parity < spinor.SiteSubset(); parity++) {
caxpy(-lambda[t_global], static_cast<char *>(spinor.data()) + offset, static_cast<char *>(ref.data()) + offset,
batch_size, inv_param.cpu_prec);

nrm2[t] += norm_2(static_cast<char *>(ref.data()) + offset, batch_size, inv_param.cpu_prec, false);
src2[t] += norm_2(static_cast<char *>(spinor.data()) + offset, batch_size, inv_param.cpu_prec, false);
nrm2[t_global] += norm_2(static_cast<char *>(ref.data()) + offset, batch_size, inv_param.cpu_prec, false);
src2[t_global] += norm_2(static_cast<char *>(spinor.data()) + offset, batch_size, inv_param.cpu_prec, false);

offset += spinor.VolumeCB() * stag_spinor_site_size * inv_param.cpu_prec;
}
}

auto l = ((double *)&(lambda[t_global]))[0];
printfQuda("Eigenvector %4d, t = %d lambda = %15.14e: tol %.2e, host residual = %.15e\n", i, t_global, l,
eig_param.tol, sqrt(nrm2[t] / src2[t]));
comm_allreduce_sum(nrm2);
comm_allreduce_sum(src2);

for (auto t = 0; t < spinor.X()[3] * comm_dim(3); t++) {
auto l = ((double *)&(lambda[t]))[0];
printfQuda("Eigenvector %4d, t = %d lambda = %15.14e: tol %.2e, host residual = %.15e\n", i, t, l, eig_param.tol,
sqrt(nrm2[t] / src2[t]));
}
return sqrt(nrm2[0] / src2[0]);
auto total_nrm2 = std::accumulate(nrm2.begin(), nrm2.end(), 0);
auto total_src2 = std::accumulate(src2.begin(), src2.end(), 0);

return sqrt(total_nrm2 / total_src2);
} else {
// Compute M * x - \lambda * x
caxpy(-lambda[0], spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec);
Expand Down
3 changes: 2 additions & 1 deletion tests/host_reference/staggered_dslash_reference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel
stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type, laplace3D);

if (dslash_type == QUDA_LAPLACE_DSLASH) {
double kappa = 1.0 / ((laplace3D < 4 ? 6 : 8) + mass);
int dimension = laplace3D < 4 ? 3 : 4;
double kappa = 1.0 / (2 * dimension + mass);
xpay(in.data(), kappa, out.data(), out.Length(), out.Precision());
} else {
axpy(2 * mass, in.data(), out.data(), out.Length(), out.Precision());
Expand Down
5 changes: 3 additions & 2 deletions tests/staggered_eigensolve_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,10 @@ std::vector<double> eigensolve(test_t test_param)
eig_param.use_smeared_gauge = gauge_smear;

if (dslash_type == QUDA_LAPLACE_DSLASH) {
eig_inv_param.kappa = 1.0 / ((laplace3D == 3 ? 6 : 8) + mass);
int dimension = laplace3D < 4 ? 3 : 4;
eig_inv_param.kappa = 1.0 / (2 * dimension + mass);
eig_inv_param.laplace3D = laplace3D;
if (laplace3D == 3) {
if (dimension == 3) {
eig_param.ortho_dim = laplace3D;
eig_param.ortho_dim_size_local = tdim;
}
Expand Down
3 changes: 2 additions & 1 deletion tests/staggered_invert_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,8 @@ int main(int argc, char **argv)
if (!is_staggered(dslash_type) && !is_laplace(dslash_type))
errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
} else {
if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON");
if (is_laplace(dslash_type))
errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON");
if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type));
}

Expand Down
8 changes: 5 additions & 3 deletions tests/utils/set_params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,16 +129,17 @@ void setInvertParam(QudaInvertParam &inv_param)
{
// Set dslash type
inv_param.dslash_type = dslash_type;
int dimension = laplace3D < 4 ? 3 : 4;

// Use kappa or mass normalisation
if (kappa == -1.0) {
inv_param.mass = mass;
inv_param.kappa = 1.0 / (2.0 * (1 + 3 / anisotropy + mass));
if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / ((laplace3D == 3 ? 6 : 8) + mass);
if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (2 * dimension + mass);
} else {
inv_param.kappa = kappa;
inv_param.mass = 0.5 / kappa - (1.0 + 3.0 / anisotropy);
if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - (laplace3D == 3 ? 6 : 8);
if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 2 * dimension;
}
if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
printfQuda("Kappa = %.8f Mass = %.8f\n", inv_param.kappa, inv_param.mass);
Expand Down Expand Up @@ -939,7 +940,8 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param)
// Solver params
inv_param.verbosity = verbosity;
inv_param.mass = mass;
inv_param.kappa = 1.0 / ((laplace3D == 3 ? 6 : 8) + mass); // for Laplace operator
int dimension = laplace3D < 4 ? 3 : 4;
inv_param.kappa = 1.0 / (2 * dimension + mass); // for Laplace operator
inv_param.laplace3D = laplace3D; // for Laplace operator

if (Nsrc < Nsrc_tile || Nsrc % Nsrc_tile != 0)
Expand Down

0 comments on commit 1af8c52

Please sign in to comment.