From dccc14868955f8a34f29a4fb45b183f23bd91381 Mon Sep 17 00:00:00 2001 From: Mark Dewing Date: Fri, 13 Oct 2023 15:53:29 +0000 Subject: [PATCH] Workaround for assignment-after-reduction bug Loop index over nw must be 32 bits in size. Bug affects offload with NVIDIA. See https://github.com/QMCPACK/qmcpack/issues/4767 --- src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.2.cpp | 8 +++++--- .../Fermion/MultiSlaterDetTableMethod.cpp | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.2.cpp b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.2.cpp index 643e2204d0..9f8ee6c62f 100644 --- a/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.2.cpp +++ b/src/QMCWaveFunctions/Fermion/MultiDiracDeterminant.2.cpp @@ -443,7 +443,7 @@ void MultiDiracDeterminant::mw_evaluateDetsForPtclMove(const RefVectorWithLeader PRAGMA_OFFLOAD("omp target teams distribute map(always, from:curRatio_list_ptr[:nw]) \ is_device_ptr(psiV_list_devptr, psiMinv_temp_list_devptr)") - for (size_t iw = 0; iw < nw; iw++) + for (uint32_t iw = 0; iw < nw; iw++) { ValueType c_ratio = 0.0; PRAGMA_OFFLOAD("omp parallel for reduction(+ : c_ratio)") @@ -780,9 +780,11 @@ void MultiDiracDeterminant::mw_evaluateDetsAndGradsForPtclMove( throw std::runtime_error("In MultiDiracDeterminant ompBLAS::copy_batched_offset failed."); + // Index of loop over nw must be 32 bit sized to avoid assignment-after-reduction offload bug + // See https://github.com/QMCPACK/qmcpack/issues/4767 PRAGMA_OFFLOAD("omp target teams distribute is_device_ptr(psiV_list_devptr, psiMinv_temp_list_devptr) \ map(always, from:curRatio_list_ptr[:nw])") - for (size_t iw = 0; iw < nw; iw++) + for (uint32_t iw = 0; iw < nw; iw++) { GradType ratioGradRef_local(0); PRAGMA_OFFLOAD("omp parallel for reduction(+ : ratioGradRef_local)") @@ -1048,7 +1050,7 @@ void MultiDiracDeterminant::mw_evaluateGrads(const RefVectorWithLeader