diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 35c865d57..fe165b530 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -23,21 +23,14 @@ namespace cms { T const *__restrict__ v, uint32_t const *__restrict__ offsets) const { const uint32_t nt = offsets[nh]; - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < nt; - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { - auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - h->count(acc, v[i], ih); - } - } + cms::alpakatools::for_each_element_1D_grid_stride(acc, nt, [&](uint32_t i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + h->count(acc, v[i], ih); + }); } }; @@ -49,22 +42,14 @@ namespace cms { T const *__restrict__ v, uint32_t const *__restrict__ offsets) const { const uint32_t nt = offsets[nh]; - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(nt)); - - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < nt; - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, nt); ++i) { - auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - h->fill(acc, v[i], i, ih); - } - } + cms::alpakatools::for_each_element_1D_grid_stride(acc, nt, [&](uint32_t i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + h->fill(acc, v[i], i, ih); + }); } }; @@ -72,12 +57,8 @@ namespace cms { template ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void operator()(const T_Acc &acc, Histo *__restrict__ h) const { - const auto &[firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(Histo::totbins())); - - for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { - h->off[i] = 0; - } + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid( + acc, Histo::totbins(), [&](uint32_t i) { h->off[i] = 0; }); } }; @@ -273,17 +254,7 @@ namespace cms { return; } - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto &[firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(totbins())); - - for (uint32_t threadIdx = m + firstElementIdxNoStride[0u], endElementIdx = m + endElementIdxNoStride[0u]; - threadIdx < totbins(); - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, totbins()); ++i) { - off[i] = n; - } - } + 
cms::alpakatools::for_each_element_1D_grid_stride(acc, totbins(), m, [&](uint32_t i) { off[i] = n; }); } template diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index 392b01be0..cd1b3461c 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -29,50 +29,268 @@ namespace cms { } /* - * Computes the range of the element(s) global index(es) in grid. + * 1D helper to only access 1 element per block + * (should obviously only be needed for debug / printout). + */ + template + ALPAKA_FN_ACC bool once_per_block_1D(const T_Acc& acc, uint32_t i) { + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + return (i % blockDimension == 0); + } + + /* + * Computes the range of the elements indexes, local to the block. * Warning: the max index is not truncated by the max number of elements of interest. */ template > - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range(const T_Acc& acc) { - Vec firstElementIdxGlobalVec = Vec::zeros(); - Vec endElementIdxUncutGlobalVec = Vec::zeros(); + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_block(const T_Acc& acc, + const Vec& elementIdxShift) { + Vec firstElementIdxVec = Vec::zeros(); + Vec endElementIdxUncutVec = Vec::zeros(); + // Loop on all grid dimensions. for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { - // Global thread index in grid (along dimension dimIndex). - const uint32_t threadIdxGlobal(alpaka::idx::getIdx(acc)[dimIndex]); + // Take into account the thread index in block. + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[dimIndex]); const uint32_t threadDimension(alpaka::workdiv::getWorkDiv(acc)[dimIndex]); - // Global element index in grid (along dimension dimIndex). + // Compute the elements indexes in block. // Obviously relevant for CPU only. - // For GPU, threadDimension = 1, and firstElementIdxGlobal = endElementIdxGlobal = threadIndexGlobal. - const uint32_t firstElementIdxGlobal = threadIdxGlobal * threadDimension; - const uint32_t endElementIdxUncutGlobal = firstElementIdxGlobal + threadDimension; + // For GPU, threadDimension = 1, and elementIdx = firstElementIdx = threadIdx + elementIdxShift. + const uint32_t firstElementIdxLocal = threadIdxLocal * threadDimension; + const uint32_t firstElementIdx = firstElementIdxLocal + elementIdxShift[dimIndex]; // Add the shift! + const uint32_t endElementIdxUncut = firstElementIdx + threadDimension; - firstElementIdxGlobalVec[dimIndex] = firstElementIdxGlobal; - endElementIdxUncutGlobalVec[dimIndex] = endElementIdxUncutGlobal; + firstElementIdxVec[dimIndex] = firstElementIdx; + endElementIdxUncutVec[dimIndex] = endElementIdxUncut; } - return {firstElementIdxGlobalVec, endElementIdxUncutGlobalVec}; + // Return element indexes, shifted by elementIdxShift. + return {firstElementIdxVec, endElementIdxUncutVec}; } /* - * Computes the range of the element(s) global index(es) in grid. + * Computes the range of the elements indexes, local to the block. * Truncated by the max number of elements of interest. 
*/ template - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_truncated( - const T_Acc& acc, const Vec& maxNumberOfElements) { + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_block_truncated( + const T_Acc& acc, const Vec& maxNumberOfElements, const Vec& elementIdxShift) { + // Check dimension static_assert(alpaka::dim::Dim::value == T_Dim::value, "Accelerator and maxNumberOfElements need to have same dimension."); - auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_global_index_range(acc); + auto&& [firstElementIdxLocalVec, endElementIdxLocalVec] = element_index_range_in_block(acc, elementIdxShift); + + // Truncate + for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { + endElementIdxLocalVec[dimIndex] = std::min(endElementIdxLocalVec[dimIndex], maxNumberOfElements[dimIndex]); + } + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxLocalVec, endElementIdxLocalVec}; + } + /* + * Computes the range of the elements indexes in grid. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template > + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_grid(const T_Acc& acc, + Vec& elementIdxShift) { + // Loop on all grid dimensions. + for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { + // Take into account the block index in grid. + const uint32_t blockIdxInGrid(alpaka::idx::getIdx(acc)[dimIndex]); + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[dimIndex]); + + // Shift to get global indices in grid (instead of local to the block) + elementIdxShift[dimIndex] += blockIdxInGrid * blockDimension; + } + + // Return element indexes, shifted by elementIdxShift. + return element_index_range_in_block(acc, elementIdxShift); + } + + /* + * Computes the range of the elements indexes in grid. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_grid_truncated( + const T_Acc& acc, const Vec& maxNumberOfElements, Vec& elementIdxShift) { + // Check dimension + static_assert(alpaka::dim::Dim::value == T_Dim::value, + "Accelerator and maxNumberOfElements need to have same dimension."); + auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_index_range_in_grid(acc, elementIdxShift); + + // Truncate for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { endElementIdxGlobalVec[dimIndex] = std::min(endElementIdxGlobalVec[dimIndex], maxNumberOfElements[dimIndex]); } + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. return {firstElementIdxGlobalVec, endElementIdxGlobalVec}; } + /* + * Computes the range of the element(s) index(es) in grid. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_grid_truncated( + const T_Acc& acc, const Vec& maxNumberOfElements) { + Vec elementIdxShift = Vec::zeros(); + return element_index_range_in_grid_truncated(acc, maxNumberOfElements, elementIdxShift); + } + + /********************************************* + * 1D HELPERS, LOOP ON ALL CPU ELEMENTS + ********************************************/ + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are local to the BLOCK. 
+ */ + template + ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_block(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const uint32_t elementIdxShift, + const Func func) { + const auto& [firstElementIdx, endElementIdx] = cms::alpakatools::element_index_range_in_block_truncated( + acc, Vec1::all(maxNumberOfElements), Vec1::all(elementIdxShift)); + + for (uint32_t elementIdx = firstElementIdx[0u]; elementIdx < endElementIdx[0u]; ++elementIdx) { + func(elementIdx); + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_block(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const Func func) { + const uint32_t elementIdxShift = 0; + cms::alpakatools::for_each_element_in_thread_1D_index_in_block(acc, maxNumberOfElements, elementIdxShift, func); + } + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are expressed in GRID 'frame-of-reference'. + */ + template + ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_grid(const T_Acc& acc, + const uint32_t maxNumberOfElements, + uint32_t elementIdxShift, + const Func func) { + // Take into account the block index in grid to compute the element indices. + const uint32_t blockIdxInGrid(alpaka::idx::getIdx(acc)[0u]); + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + elementIdxShift += blockIdxInGrid * blockDimension; + + for_each_element_in_thread_1D_index_in_block(acc, maxNumberOfElements, elementIdxShift, func); + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_grid(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const Func func) { + const uint32_t elementIdxShift = 0; + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, maxNumberOfElements, elementIdxShift, func); + } + + /****************************************************************************** + * 1D HELPERS, LOOP ON ALL CPU ELEMENTS, AND ELEMENT/THREAD STRIDED ACCESS + ******************************************************************************/ + + /* + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by BLOCK size. + * Indexes are local to the BLOCK. + */ + template + ALPAKA_FN_ACC void for_each_element_1D_block_stride(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const uint32_t elementIdxShift, + const Func func) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_index_range_in_block(acc, Vec1::all(elementIdxShift)); + + // Stride = block size. + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + // Strided access. + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < maxNumberOfElements; + threadIdx += blockDimension, endElementIdx += blockDimension) { + // (CPU) Loop on all elements. 
+ for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_1D_block_stride(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const Func func) { + const uint32_t elementIdxShift = 0; + cms::alpakatools::for_each_element_1D_block_stride(acc, maxNumberOfElements, elementIdxShift, func); + } + + /* + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by GRID size. + * Indexes are local to the GRID. + */ + template + ALPAKA_FN_ACC void for_each_element_1D_grid_stride(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const uint32_t elementIdxShift, + const Func func) { + Vec1 elementIdxShiftVec = Vec1::all(elementIdxShift); + + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_index_range_in_grid(acc, elementIdxShiftVec); + + // Stride = grid size. + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + // Strided access. + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { + // (CPU) Loop on all elements. + for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_1D_grid_stride(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const Func func) { + const uint32_t elementIdxShift = 0; + cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func); + } + } // namespace alpakatools } // namespace cms diff --git a/src/alpaka/AlpakaCore/prefixScan.h b/src/alpaka/AlpakaCore/prefixScan.h index 829d949fa..a54b327d9 100644 --- a/src/alpaka/AlpakaCore/prefixScan.h +++ b/src/alpaka/AlpakaCore/prefixScan.h @@ -2,16 +2,16 @@ #define HeterogeneousCore_AlpakaUtilities_interface_prefixScan_h #include +#include "CUDACore/CMSUnrollLoop.h" #include "AlpakaCore/alpakaConfig.h" #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED template -ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan( - uint32_t laneId, T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { +ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T const* ci, T* co, uint32_t i, uint32_t mask) { // ci and co may be the same auto x = ci[i]; -#pragma unroll + CMS_UNROLL_LOOP for (int offset = 1; offset < 32; offset <<= 1) { auto y = __shfl_up_sync(mask, x, offset); if (laneId >= offset) @@ -23,7 +23,7 @@ ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan( template ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T* c, uint32_t i, uint32_t mask) { auto x = c[i]; -#pragma unroll + CMS_UNROLL_LOOP for (int offset = 1; offset < 32; offset <<= 1) { auto y = __shfl_up_sync(mask, x, offset); if (laneId >= offset) @@ -39,8 +39,8 @@ namespace cms { // limited to 32*32 elements.... 
template ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const T_Acc& acc, - T const* __restrict__ ci, - T* __restrict__ co, + T const* ci, + T* co, uint32_t size, T* ws #ifndef ALPAKA_ACC_GPU_CUDA_ENABLED @@ -48,7 +48,6 @@ namespace cms { #endif ) { #if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__ - uint32_t const blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); uint32_t const gridBlockIdx(alpaka::idx::getIdx(acc)[0u]); uint32_t const blockThreadIdx(alpaka::idx::getIdx(acc)[0u]); @@ -79,6 +78,7 @@ namespace cms { co[i] += ws[warpId - 1]; } alpaka::block::sync::syncBlockThreads(acc); + #else co[0] = ci[0]; for (uint32_t i = 1; i < size; ++i) @@ -142,7 +142,6 @@ namespace cms { uint32_t const threadDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); uint32_t const blockIdx(alpaka::idx::getIdx(acc)[0u]); - uint32_t const threadIdx(alpaka::idx::getIdx(acc)[0u]); auto&& ws = alpaka::block::shared::st::allocVar(acc); // first each block does a scan of size 1024; (better be enough blocks....) @@ -161,7 +160,6 @@ namespace cms { uint32_t const blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); uint32_t const threadDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - uint32_t const blockIdx(alpaka::idx::getIdx(acc)[0u]); uint32_t const threadIdx(alpaka::idx::getIdx(acc)[0u]); auto* const psum(alpaka::block::shared::dyn::getMem(acc)); diff --git a/src/alpaka/CUDACore/CMSUnrollLoop.h b/src/alpaka/CUDACore/CMSUnrollLoop.h new file mode 100644 index 000000000..a46df28a2 --- /dev/null +++ b/src/alpaka/CUDACore/CMSUnrollLoop.h @@ -0,0 +1,51 @@ +#ifndef FWCore_Utilities_interface_CMSUnrollLoop_h +#define FWCore_Utilities_interface_CMSUnrollLoop_h + +// convert the macro argument to a null-terminated quoted string +#define STRINGIFY_(ARG) #ARG +#define STRINGIFY(ARG) STRINGIFY_(ARG) + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// CUDA or HIP device compiler + +#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(unroll)) +#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(unroll N)) +#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(unroll 1)) + +#define CMS_DEVICE_UNROLL_LOOP _Pragma(STRINGIFY(unroll)) +#define CMS_DEVICE_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(unroll N)) +#define CMS_DEVICE_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(unroll 1)) + +#else // defined (__CUDA_ARCH__) || defined (__HIP_DEVICE_COMPILE__) + +// any host compiler +#define CMS_DEVICE_UNROLL_LOOP +#define CMS_DEVICE_UNROLL_LOOP_COUNT(N) +#define CMS_DEVICE_UNROLL_LOOP_DISABLE + +#if defined(__clang__) +// clang host compiler + +#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(clang loop unroll(enable))) +#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(clang loop unroll_count(N))) +#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(clang loop unroll(disable))) + +#elif defined(__GNUC__) +// GCC host compiler + +#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(GCC ivdep)) +#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(GCC unroll N)) _Pragma(STRINGIFY(GCC ivdep)) +#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(GCC unroll 1)) + +#else +// unsupported or unknown compiler + +#define CMS_UNROLL_LOOP +#define CMS_UNROLL_LOOP_COUNT(N) +#define CMS_UNROLL_LOOP_DISABLE + +#endif // defined(__clang__) || defined(__GNUC__) || ... 
+ +#endif // defined (__CUDA_ARCH__) || defined (__HIP_DEVICE_COMPILE__) + +#endif // FWCore_Utilities_interface_CMSUnrollLoop_h diff --git a/src/alpaka/test/alpaka/AtomicPairCounter_t.cc b/src/alpaka/test/alpaka/AtomicPairCounter_t.cc index 3eefc81a3..05e146789 100644 --- a/src/alpaka/test/alpaka/AtomicPairCounter_t.cc +++ b/src/alpaka/test/alpaka/AtomicPairCounter_t.cc @@ -11,10 +11,7 @@ struct update { template ALPAKA_FN_ACC void operator()( const T_Acc &acc, cms::alpakatools::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { - const auto &[firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); - - for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, n, [&](uint32_t i) { auto m = i % 11; m = m % 6 + 1; // max 6, no 0 auto c = dc->add(acc, m); @@ -22,7 +19,7 @@ struct update { ind[c.m] = c.n; for (uint32_t j = c.n; j < c.n + m; ++j) cont[j] = i; - } + }); } }; @@ -45,10 +42,7 @@ struct verify { uint32_t const *ind, uint32_t const *cont, uint32_t n) const { - const auto &[firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); - - for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, n, [&](uint32_t i) { assert(0 == ind[0]); assert(dc->get().m == n); assert(ind[n] == dc->get().n); @@ -58,7 +52,7 @@ struct verify { assert(k < n); for (; ib < ie; ++ib) assert(cont[ib] == k); - } + }); } }; diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index a86d59e76..537b3d87b 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -23,36 +23,16 @@ struct mykernel { auto&& hist = alpaka::block::shared::st::allocVar(acc); auto&& ws = alpaka::block::shared::st::allocVar(acc); - const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc); - // set off zero - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < Hist::totbins(); - threadIdx += blockDimension, endElementIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::totbins()); ++j) { - hist.off[j] = 0; - } - } + cms::alpakatools::for_each_element_1D_block_stride(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); alpaka::block::sync::syncBlockThreads(acc); // set bins zero - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < Hist::capacity(); - threadIdx += blockDimension, endElementIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::totbins()); ++j) { - hist.bins[j] = 0; - } - } + cms::alpakatools::for_each_element_1D_block_stride(acc, Hist::totbins(), [&](uint32_t j) { hist.bins[j] = 0; }); alpaka::block::sync::syncBlockThreads(acc); // count - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < N; - threadIdx += blockDimension, endElementIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { - hist.count(acc, v[j]); - } - } + 
cms::alpakatools::for_each_element_1D_block_stride(acc, N, [&](uint32_t j) { hist.count(acc, v[j]); }); alpaka::block::sync::syncBlockThreads(acc); assert(0 == hist.size()); @@ -62,83 +42,62 @@ struct mykernel { hist.finalize(acc, ws); alpaka::block::sync::syncBlockThreads(acc); - if (threadIdxLocal == 0) { - printf("hist.size() = %u.\n", hist.size()); - } assert(N == hist.size()); // verify - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < Hist::nbins(); - threadIdx += blockDimension, endElementIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, Hist::nbins()); ++j) { - assert(hist.off[j] <= hist.off[j + 1]); - } - } + cms::alpakatools::for_each_element_1D_block_stride( + acc, Hist::nbins(), [&](uint32_t j) { assert(hist.off[j] <= hist.off[j + 1]); }); alpaka::block::sync::syncBlockThreads(acc); - if (threadIdxLocal < 32) { - ws[threadIdxLocal] = 0; // used by prefix scan... - } + cms::alpakatools::for_each_element_in_thread_1D_index_in_block(acc, 32, [&](uint32_t i) { + ws[i] = 0; // used by prefix scan... + }); alpaka::block::sync::syncBlockThreads(acc); // fill - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < N; - threadIdx += blockDimension, endElementIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, N); ++j) { - hist.fill(acc, v[j], j); - } - } + cms::alpakatools::for_each_element_1D_block_stride(acc, N, [&](uint32_t j) { hist.fill(acc, v[j], j); }); alpaka::block::sync::syncBlockThreads(acc); assert(0 == hist.off[0]); assert(N == hist.size()); // bin - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < hist.size() - 1; - threadIdx += blockDimension, endElementIdx += blockDimension) { - for (uint32_t j = threadIdx; j < std::min(endElementIdx, hist.size() - 1); ++j) { - auto p = hist.begin() + j; - assert((*p) < N); - auto k1 = Hist::bin(v[*p]); - auto k2 = Hist::bin(v[*(p + 1)]); - assert(k2 >= k1); - } - } + cms::alpakatools::for_each_element_1D_block_stride(acc, hist.size() - 1, [&](uint32_t j) { + auto p = hist.begin() + j; + assert((*p) < N); + auto k1 = Hist::bin(v[*p]); + auto k2 = Hist::bin(v[*(p + 1)]); + assert(k2 >= k1); + }); // forEachInWindow - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < hist.size(); - threadIdx += blockDimension, endElementIdx += blockDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, hist.size()); ++i) { - auto p = hist.begin() + i; - auto j = *p; - auto b0 = Hist::bin(v[j]); - int tot = 0; - auto ftest = [&](unsigned int k) { - assert(k < N); - ++tot; - }; - cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); - int rtot = hist.size(b0); - assert(tot == rtot); - tot = 0; - auto vm = int(v[j]) - DELTA; - auto vp = int(v[j]) + DELTA; - constexpr int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); - vm = std::max(vm, 0); - vm = std::min(vm, vmax); - vp = std::min(vp, vmax); - vp = std::max(vp, 0); - assert(vp >= vm); - cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); - int bp = Hist::bin(vp); - int bm = Hist::bin(vm); - rtot = hist.end(bp) - hist.begin(bm); - assert(tot == rtot); - } - } + cms::alpakatools::for_each_element_1D_block_stride(acc, hist.size(), [&](uint32_t i) { + auto p = hist.begin() + i; + auto j = *p; + auto b0 = Hist::bin(v[j]); + int tot = 0; + auto ftest = [&](unsigned int k) { + assert(k < N); + ++tot; + }; + cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); + int rtot = hist.size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + assert(tot == rtot); + }); } }; diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index 00e774a08..a9a16effe 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -23,25 +23,20 @@ struct countMultiLocal { TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { - auto&& local = alpaka::block::shared::st::allocVar(acc); - if (threadIdxLocal == 0) { - local.zero(); - } - alpaka::block::sync::syncBlockThreads(acc); - local.countDirect(acc, 2 + i % 4); - alpaka::block::sync::syncBlockThreads(acc); - if (threadIdxLocal == 0) { - assoc->add(acc, local); - } + cms::alpakatools::for_each_element_1D_grid_stride(acc, n, [&](uint32_t i) { + auto&& local = alpaka::block::shared::st::allocVar(acc); + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); + const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0); + if (oncePerSharedMemoryAccess) { + local.zero(); } - } + alpaka::block::sync::syncBlockThreads(acc); + local.countDirect(acc, 2 + i % 4); + alpaka::block::sync::syncBlockThreads(acc); + if (oncePerSharedMemoryAccess) { + assoc->add(acc, local); + } + }); } }; @@ -51,32 +46,15 @@ struct countMulti { TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, n); ++i) { - assoc->countDirect(acc, 2 + i % 4); - } - } + 
cms::alpakatools::for_each_element_1D_grid_stride(acc, n, [&](uint32_t i) { assoc->countDirect(acc, 2 + i % 4); }); } }; struct verifyMulti { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const { - const uint32_t maxNumberOfElements = Multiplicity::totbins(); - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(maxNumberOfElements)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < maxNumberOfElements; - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { - assert(m1->off[i] == m2->off[i]); - } - } + cms::alpakatools::for_each_element_1D_grid_stride( + acc, Multiplicity::totbins(), [&](uint32_t i) { assert(m1->off[i] == m2->off[i]); }); } }; @@ -86,25 +64,17 @@ struct count { TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { - const uint32_t maxNumberOfElements = 4 * n; - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(maxNumberOfElements)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < maxNumberOfElements; - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { - auto k = i / 4; - auto j = i - 4 * k; - assert(j < 4); - if (k >= n) { - return; - } - if (tk[k][j] < MaxElem) { - assoc->countDirect(acc, tk[k][j]); - } + cms::alpakatools::for_each_element_1D_grid_stride(acc, 4 * n, [&](uint32_t i) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) { + return; } - } + if (tk[k][j] < MaxElem) { + assoc->countDirect(acc, tk[k][j]); + } + }); } }; @@ -114,25 +84,17 @@ struct fill { TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { - const uint32_t maxNumberOfElements = 4 * n; - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(maxNumberOfElements)); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; - threadIdx < maxNumberOfElements; - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { - auto k = i / 4; - auto j = i - 4 * k; - assert(j < 4); - if (k >= n) { - return; - } - if (tk[k][j] < MaxElem) { - assoc->fillDirect(acc, tk[k][j], k); - } + cms::alpakatools::for_each_element_1D_grid_stride(acc, 4 * n, [&](uint32_t i) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) { + return; } - } + if (tk[k][j] < MaxElem) { + assoc->fillDirect(acc, tk[k][j], k); + } + }); } }; @@ -150,16 +112,10 @@ struct fillBulk { TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { - const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); - for (uint32_t threadIdx 
= firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < n; - threadIdx += gridDimension, endElementIdx += gridDimension) { - for (uint32_t k = threadIdx; k < std::min(endElementIdx, n); ++k) { - auto m = tk[k][3] < MaxElem ? 4 : 3; - assoc->bulkFill(acc, *apc, &tk[k][0], m); - } - } + cms::alpakatools::for_each_element_1D_grid_stride(acc, n, [&](uint32_t k) { + auto m = tk[k][3] < MaxElem ? 4 : 3; + assoc->bulkFill(acc, *apc, &tk[k][0], m); + }); } }; diff --git a/src/alpaka/test/alpaka/prefixScan_t.cc b/src/alpaka/test/alpaka/prefixScan_t.cc index 59e63db68..0f8ce64e7 100644 --- a/src/alpaka/test/alpaka/prefixScan_t.cc +++ b/src/alpaka/test/alpaka/prefixScan_t.cc @@ -4,7 +4,6 @@ #include "AlpakaCore/alpakaWorkDivHelper.h" #include "AlpakaCore/prefixScan.h" -using namespace cms::alpakatools; using namespace ALPAKA_ACCELERATOR_NAMESPACE; template @@ -27,27 +26,27 @@ struct testPrefixScan { auto&& c = alpaka::block::shared::st::allocVar(acc); auto&& co = alpaka::block::shared::st::allocVar(acc); - uint32_t const blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - uint32_t const blockThreadIdx(alpaka::idx::getIdx(acc)[0u]); + cms::alpakatools::for_each_element_1D_block_stride(acc, size, [&](uint32_t i) { c[i] = 1; }); - auto first = blockThreadIdx; - for (auto i = first; i < size; i += blockDimension) - c[i] = 1; alpaka::block::sync::syncBlockThreads(acc); - blockPrefixScan(acc, c, co, size, ws); - blockPrefixScan(acc, c, size, ws); + cms::alpakatools::blockPrefixScan(acc, c, co, size, ws); + cms::alpakatools::blockPrefixScan(acc, c, size, ws); assert(1 == c[0]); assert(1 == co[0]); - for (auto i = first + 1; i < size; i += blockDimension) { + + cms::alpakatools::for_each_element_1D_block_stride(acc, size, 1u, [&](uint32_t i) { assert(c[i] == c[i - 1] + 1); assert(c[i] == i + 1); assert(c[i] = co[i]); - } + }); } }; +/* + * NB: GPU-only, so do not care about elements here. 
+ */ template struct testWarpPrefixScan { template @@ -83,29 +82,24 @@ struct testWarpPrefixScan { struct init { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t* v, uint32_t val, uint32_t n) const { - const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); - - for (uint32_t index = firstElementIdxGlobal[0u]; index < endElementIdxGlobal[0u]; ++index) { + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, n, [&](uint32_t index) { v[index] = val; if (index == 0) printf("init\n"); - } + }); } }; struct verify { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t const* v, uint32_t n) const { - const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(n)); - - for (uint32_t index = firstElementIdxGlobal[0u]; index < endElementIdxGlobal[0u]; ++index) { + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, n, [&](uint32_t index) { assert(v[index] == index + 1); + if (index == 0) printf("verify\n"); - } + }); } }; @@ -116,120 +110,100 @@ int main() { Queue queue(device); - Vec1 elementsPerThread(Vec1::all(1)); - Vec1 threadsPerBlock(Vec1::all(32)); - Vec1 blocksPerGrid(Vec1::all(1)); -#if defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED || ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED || ALPAKA_ACC_CPU_BT_OMP4_ENABLED - // on the GPU, run with 512 threads in parallel per block, each looking at a single element - // on the CPU, run serially with a single thread per block, over 512 elements - std::swap(threadsPerBlock, elementsPerThread); -#endif -#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - threadsPerBlock = Vec1::all(1); -#endif - - const WorkDiv1 workDiv(blocksPerGrid, threadsPerBlock, elementsPerThread); - std::cout << "blocks per grid: " << blocksPerGrid << ", threads per block: " << threadsPerBlock - << ", elements per thread: " << elementsPerThread << std::endl; - + // WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY) #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED std::cout << "warp level" << std::endl; - alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, testWarpPrefixScan(), 32)); - alpaka::wait::wait(queue); - alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, testWarpPrefixScan(), 16)); - alpaka::wait::wait(queue); + const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(32)); + const Vec1 blocksPerGrid1(Vec1::all(1)); + const WorkDiv1& workDivWarp = cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1); + + alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDivWarp, testWarpPrefixScan(), 32)); + + alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDivWarp, testWarpPrefixScan(), 16)); - alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDiv, testWarpPrefixScan(), 5)); - alpaka::wait::wait(queue); + alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDivWarp, testWarpPrefixScan(), 5)); #endif + + // PORTABLE BLOCK PREFIXSCAN std::cout << "block level" << std::endl; - int bs = 1; -#if not defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - for (bs = 32; bs <= 1024; bs += 32) { -#endif - std::cout << "bs " << bs << std::endl; + // Running kernel with 1 block, and bs threads per block or elements per thread. + // NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version. 
+ for (int bs = 32; bs <= 1024; bs += 32) { + const Vec1 threadsPerBlockOrElementsPerThread2(Vec1::all(bs)); + const Vec1 blocksPerGrid2(Vec1::all(1)); + const WorkDiv1& workDivSingleBlock = + cms::alpakatools::make_workdiv(blocksPerGrid2, threadsPerBlockOrElementsPerThread2); + + std::cout << "blocks per grid: " << blocksPerGrid2 + << ", threads per block or elements per thread: " << threadsPerBlockOrElementsPerThread2 << std::endl; + + // Problem size for (int j = 1; j <= 1024; ++j) { - // running kernel with 1 block, bs threads per block, 1 element per thread alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( - WorkDiv1{Vec1::all(1), Vec1::all(bs), Vec1::all(1)}, testPrefixScan(), j)); - alpaka::wait::wait(queue); + alpaka::kernel::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel( - WorkDiv1{Vec1::all(1), Vec1::all(bs), Vec1::all(1)}, testPrefixScan(), j)); - alpaka::wait::wait(queue); + alpaka::kernel::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); } -#if not defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED } -#endif - - alpaka::wait::wait(queue); + // PORTABLE MULTI-BLOCK PREFIXSCAN int num_items = 200; for (int ksize = 1; ksize < 4; ++ksize) { - // test multiblock std::cout << "multiblock" << std::endl; - num_items *= 8; - uint32_t* d_in; - uint32_t* d_out1; - uint32_t* d_out2; + num_items *= 10; - auto input_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items * sizeof(uint32_t))); + auto input_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); uint32_t* input_d = alpaka::mem::view::getPtrNative(input_dBuf); - auto output1_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items * sizeof(uint32_t))); + auto output1_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); uint32_t* output1_d = alpaka::mem::view::getPtrNative(output1_dBuf); -#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - auto nthreads = 1; -#else - auto nthreads = 256; -#endif - auto nblocks = (num_items + nthreads - 1) / nthreads; + const auto nThreadsInit = 256; // NB: 1024 would be better + // Just kept here to be identical to CUDA test + const Vec1 threadsPerBlockOrElementsPerThread3(Vec1::all(nThreadsInit)); + const auto nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit; + const Vec1 blocksPerGrid3(Vec1::all(nBlocksInit)); + const WorkDiv1& workDivMultiBlockInit = + cms::alpakatools::make_workdiv(blocksPerGrid3, threadsPerBlockOrElementsPerThread3); alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel( - WorkDiv1{Vec1::all(nblocks), Vec1::all(nthreads), Vec1::all(1)}, init(), input_d, 1, num_items)); - alpaka::wait::wait(queue); - -#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - nthreads = 1; - auto nelements = 768; - nblocks = (num_items + nelements - 1) / nelements; -#else - nthreads = 768; - auto nelements = 1; - nblocks = (num_items + nthreads - 1) / nthreads; -#endif + queue, alpaka::kernel::createTaskKernel(workDivMultiBlockInit, init(), input_d, 1, num_items)); - std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; + const auto nThreads = 1024; + const Vec1 threadsPerBlockOrElementsPerThread4(Vec1::all(nThreads)); + const auto nBlocks = (num_items + nThreads - 1) / nThreads; + const Vec1 blocksPerGrid4(Vec1::all(nBlocks)); + const WorkDiv1& workDivMultiBlock = + cms::alpakatools::make_workdiv(blocksPerGrid4, threadsPerBlockOrElementsPerThread4); + + std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks 
<< std::endl; alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(nblocks), Vec1::all(nthreads), Vec1::all(nelements)}, - multiBlockPrefixScanFirstStep(), + alpaka::kernel::createTaskKernel(workDivMultiBlock, + cms::alpakatools::multiBlockPrefixScanFirstStep(), input_d, output1_d, num_items)); - alpaka::wait::wait(queue); + + const Vec1 blocksPerGridSecondStep(Vec1::all(1)); + const WorkDiv1& workDivMultiBlockSecondStep = + cms::alpakatools::make_workdiv(blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4); alpaka::queue::enqueue( queue, - alpaka::kernel::createTaskKernel(WorkDiv1{Vec1::all(1), Vec1::all(nthreads), Vec1::all(nelements)}, - multiBlockPrefixScanSecondStep(), + alpaka::kernel::createTaskKernel(workDivMultiBlockSecondStep, + cms::alpakatools::multiBlockPrefixScanSecondStep(), input_d, output1_d, num_items, - nblocks)); - alpaka::wait::wait(queue); + nBlocks)); - alpaka::queue::enqueue( - queue, - alpaka::kernel::createTaskKernel( - WorkDiv1{Vec1::all(nblocks), Vec1::all(nthreads), Vec1::all(nelements)}, verify(), output1_d, num_items)); - alpaka::wait::wait(queue); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDivMultiBlock, verify(), output1_d, num_items)); + + alpaka::wait::wait(queue); // input_dBuf and output1_dBuf end of scope + } // ksize - } // ksize return 0; } diff --git a/src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h b/src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h index 67ebf9781..d008449d3 100644 --- a/src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpakatest/AlpakaCore/alpakaWorkDivHelper.h @@ -29,33 +29,268 @@ namespace cms { } /* - * Computes the range of the element(s) global index(es) in grid. + * 1D helper to only access 1 element per block + * (should obviously only be needed for debug / printout). */ - template - ALPAKA_FN_ACC std::pair, Vec> element_global_index_range_truncated(const T_Acc& acc, - const Vec& maxNumberOfElements) { - Vec firstElementIdxGlobalVec = Vec::zeros(); - Vec endElementIdxGlobalVec = Vec::zeros(); + template + ALPAKA_FN_ACC bool once_per_block_1D(const T_Acc& acc, uint32_t i) { + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + return (i % blockDimension == 0); + } + + /* + * Computes the range of the elements indexes, local to the block. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template > + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_block(const T_Acc& acc, + const Vec& elementIdxShift) { + Vec firstElementIdxVec = Vec::zeros(); + Vec endElementIdxUncutVec = Vec::zeros(); + // Loop on all grid dimensions. for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { - // Global thread index in grid (along dimension dimIndex). - const uint32_t threadIdxGlobal(alpaka::idx::getIdx(acc)[dimIndex]); + // Take into account the thread index in block. + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[dimIndex]); const uint32_t threadDimension(alpaka::workdiv::getWorkDiv(acc)[dimIndex]); - // Global element index in grid (along dimension dimIndex). + // Compute the elements indexes in block. // Obviously relevant for CPU only. - // For GPU, threadDimension = 1, and firstElementIdxGlobal = endElementIdxGlobal = threadIndexGlobal. 
- const uint32_t firstElementIdxGlobal = threadIdxGlobal * threadDimension; - const uint32_t endElementIdxGlobalUncut = firstElementIdxGlobal + threadDimension; - const uint32_t endElementIdxGlobal = std::min(endElementIdxGlobalUncut, maxNumberOfElements[dimIndex]); + // For GPU, threadDimension = 1, and elementIdx = firstElementIdx = threadIdx + elementIdxShift. + const uint32_t firstElementIdxLocal = threadIdxLocal * threadDimension; + const uint32_t firstElementIdx = firstElementIdxLocal + elementIdxShift[dimIndex]; // Add the shift! + const uint32_t endElementIdxUncut = firstElementIdx + threadDimension; + + firstElementIdxVec[dimIndex] = firstElementIdx; + endElementIdxUncutVec[dimIndex] = endElementIdxUncut; + } + + // Return element indexes, shifted by elementIdxShift. + return {firstElementIdxVec, endElementIdxUncutVec}; + } + + /* + * Computes the range of the elements indexes, local to the block. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_block_truncated( + const T_Acc& acc, const Vec& maxNumberOfElements, const Vec& elementIdxShift) { + // Check dimension + static_assert(alpaka::dim::Dim::value == T_Dim::value, + "Accelerator and maxNumberOfElements need to have same dimension."); + auto&& [firstElementIdxLocalVec, endElementIdxLocalVec] = element_index_range_in_block(acc, elementIdxShift); + + // Truncate + for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { + endElementIdxLocalVec[dimIndex] = std::min(endElementIdxLocalVec[dimIndex], maxNumberOfElements[dimIndex]); + } + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxLocalVec, endElementIdxLocalVec}; + } + + /* + * Computes the range of the elements indexes in grid. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template > + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_grid(const T_Acc& acc, + Vec& elementIdxShift) { + // Loop on all grid dimensions. + for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { + // Take into account the block index in grid. + const uint32_t blockIdxInGrid(alpaka::idx::getIdx(acc)[dimIndex]); + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[dimIndex]); + + // Shift to get global indices in grid (instead of local to the block) + elementIdxShift[dimIndex] += blockIdxInGrid * blockDimension; + } + + // Return element indexes, shifted by elementIdxShift. + return element_index_range_in_block(acc, elementIdxShift); + } + + /* + * Computes the range of the elements indexes in grid. + * Truncated by the max number of elements of interest. 
+ */ + template + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_grid_truncated( + const T_Acc& acc, const Vec& maxNumberOfElements, Vec& elementIdxShift) { + // Check dimension + static_assert(alpaka::dim::Dim::value == T_Dim::value, + "Accelerator and maxNumberOfElements need to have same dimension."); + auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_index_range_in_grid(acc, elementIdxShift); - firstElementIdxGlobalVec[dimIndex] = firstElementIdxGlobal; - endElementIdxGlobalVec[dimIndex] = endElementIdxGlobal; + // Truncate + for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) { + endElementIdxGlobalVec[dimIndex] = std::min(endElementIdxGlobalVec[dimIndex], maxNumberOfElements[dimIndex]); } + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. return {firstElementIdxGlobalVec, endElementIdxGlobalVec}; } + /* + * Computes the range of the element(s) index(es) in grid. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair, Vec> element_index_range_in_grid_truncated( + const T_Acc& acc, const Vec& maxNumberOfElements) { + Vec elementIdxShift = Vec::zeros(); + return element_index_range_in_grid_truncated(acc, maxNumberOfElements, elementIdxShift); + } + + /********************************************* + * 1D HELPERS, LOOP ON ALL CPU ELEMENTS + ********************************************/ + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are local to the BLOCK. + */ + template + ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_block(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const uint32_t elementIdxShift, + const Func& func) { + const auto& [firstElementIdx, endElementIdx] = cms::alpakatools::element_index_range_in_block_truncated( + acc, Vec1::all(maxNumberOfElements), Vec1::all(elementIdxShift)); + + for (uint32_t elementIdx = firstElementIdx[0u]; elementIdx < endElementIdx[0u]; ++elementIdx) { + func(elementIdx); + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_block(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const Func& func) { + const uint32_t elementIdxShift = 0; + cms::alpakatools::for_each_element_in_thread_1D_index_in_block(acc, maxNumberOfElements, elementIdxShift, func); + } + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are expressed in GRID 'frame-of-reference'. + */ + template + ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_grid(const T_Acc& acc, + const uint32_t maxNumberOfElements, + uint32_t elementIdxShift, + const Func& func) { + // Take into account the block index in grid to compute the element indices. 
+ const uint32_t blockIdxInGrid(alpaka::idx::getIdx(acc)[0u]); + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + elementIdxShift += blockIdxInGrid * blockDimension; + + for_each_element_in_thread_1D_index_in_block(acc, maxNumberOfElements, elementIdxShift, func); + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_grid(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const Func& func) { + const uint32_t elementIdxShift = 0; + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, maxNumberOfElements, elementIdxShift, func); + } + + /****************************************************************************** + * 1D HELPERS, LOOP ON ALL CPU ELEMENTS, AND ELEMENT/THREAD STRIDED ACCESS + ******************************************************************************/ + + /* + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by BLOCK size. + * Indexes are local to the BLOCK. + */ + template + ALPAKA_FN_ACC void for_each_element_1D_block_stride(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const uint32_t elementIdxShift, + const Func& func) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_index_range_in_block(acc, Vec1::all(elementIdxShift)); + + // Stride = block size. + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + // Strided access. + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < maxNumberOfElements; + threadIdx += blockDimension, endElementIdx += blockDimension) { + // (CPU) Loop on all elements. + for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_1D_block_stride(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const Func& func) { + const uint32_t elementIdxShift = 0; + cms::alpakatools::for_each_element_1D_block_stride(acc, maxNumberOfElements, elementIdxShift, func); + } + + /* + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by GRID size. + * Indexes are local to the GRID. + */ + template + ALPAKA_FN_ACC void for_each_element_1D_grid_stride(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const uint32_t elementIdxShift, + const Func& func) { + Vec1 elementIdxShiftVec = Vec1::all(elementIdxShift); + + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_index_range_in_grid(acc, elementIdxShiftVec); + + // Stride = grid size. + const uint32_t gridDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + // Strided access. + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; + threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { + // (CPU) Loop on all elements. 
+ for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_1D_grid_stride(const T_Acc& acc, + const uint32_t maxNumberOfElements, + const Func& func) { + const uint32_t elementIdxShift = 0; + cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func); + } + } // namespace alpakatools } // namespace cms diff --git a/src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc b/src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc index 1e62b169e..3bf3b2589 100644 --- a/src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc +++ b/src/alpakatest/plugin-Test1/alpaka/alpakaAlgo1.cc @@ -13,12 +13,8 @@ namespace { unsigned int numElements) const { // Global element index in 1D grid. // NB: On GPU, i = threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. - const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); - - for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { - c[i] = a[i] + b[i]; - } + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid( + acc, numElements, [&](uint32_t i) { c[i] = a[i] + b[i]; }); } }; @@ -32,7 +28,7 @@ namespace { // Global element index in 2D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec2::all(numElements)); + cms::alpakatools::element_index_range_in_grid_truncated(acc, Vec2::all(numElements)); for (uint32_t col = firstElementIdxGlobal[0u]; col < endElementIdxGlobal[0u]; ++col) { for (uint32_t row = firstElementIdxGlobal[1u]; row < endElementIdxGlobal[1u]; ++row) { @@ -52,7 +48,7 @@ namespace { // Global element index in 2D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec2::all(numElements)); + cms::alpakatools::element_index_range_in_grid_truncated(acc, Vec2::all(numElements)); for (uint32_t col = firstElementIdxGlobal[0u]; col < endElementIdxGlobal[0u]; ++col) { for (uint32_t row = firstElementIdxGlobal[1u]; row < endElementIdxGlobal[1u]; ++row) { @@ -75,16 +71,13 @@ namespace { unsigned int numElements) const { // Global element index in 1D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. - const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); - - for (uint32_t row = firstElementIdxGlobal[0u]; row < endElementIdxGlobal[0u]; ++row) { + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, numElements, [&](uint32_t row) { T_Data tmp = 0; for (unsigned int i = 0; i < numElements; ++i) { tmp += a[row * numElements + i] * b[i]; } c[row] = tmp; - } + }); } }; @@ -100,15 +93,12 @@ namespace { ALPAKA_FN_ACC void operator()(const T_Acc& acc, const T_Data* result, unsigned int numElements) const { // Global element index in 1D grid. // NB: On GPU, i = threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. 
- const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); - - for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, numElements, [&](uint32_t i) { // theoreticalResult = i+i^2 = i*(i+1) if (result[i] != i * (i + 1)) { printf("Wrong vectorAdd results, i = %u, c[i] = %f.\n", i, result[i]); } - } + }); } }; diff --git a/src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc b/src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc index 9e8cef29c..db1a22398 100644 --- a/src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc +++ b/src/alpakatest/plugin-Test2/alpaka/alpakaAlgo2.cc @@ -13,12 +13,8 @@ namespace { unsigned int numElements) const { // Global element index in 1D grid. // NB: On GPU, i = threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. - const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); - - for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { - c[i] = a[i] + b[i]; - } + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid( + acc, numElements, [&](uint32_t i) { c[i] = a[i] + b[i]; }); } }; @@ -32,7 +28,7 @@ namespace { // Global element index in 2D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec2::all(numElements)); + cms::alpakatools::element_index_range_in_grid_truncated(acc, Vec2::all(numElements)); for (uint32_t col = firstElementIdxGlobal[0u]; col < endElementIdxGlobal[0u]; ++col) { for (uint32_t row = firstElementIdxGlobal[1u]; row < endElementIdxGlobal[1u]; ++row) { @@ -52,7 +48,7 @@ namespace { // Global element index in 2D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec2::all(numElements)); + cms::alpakatools::element_index_range_in_grid_truncated(acc, Vec2::all(numElements)); for (uint32_t col = firstElementIdxGlobal[0u]; col < endElementIdxGlobal[0u]; ++col) { for (uint32_t row = firstElementIdxGlobal[1u]; row < endElementIdxGlobal[1u]; ++row) { @@ -75,16 +71,13 @@ namespace { unsigned int numElements) const { // Global element index in 1D grid. // NB: On GPU, threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. - const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); - - for (uint32_t row = firstElementIdxGlobal[0u]; row < endElementIdxGlobal[0u]; ++row) { + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, numElements, [&](uint32_t row) { T_Data tmp = 0; for (unsigned int i = 0; i < numElements; ++i) { tmp += a[row * numElements + i] * b[i]; } c[row] = tmp; - } + }); } }; @@ -100,15 +93,12 @@ namespace { ALPAKA_FN_ACC void operator()(const T_Acc& acc, const T_Data* result, unsigned int numElements) const { // Global element index in 1D grid. // NB: On GPU, i = threadIndexGlobal = firstElementIdxGlobal = endElementIdxGlobal. 
- const auto& [firstElementIdxGlobal, endElementIdxGlobal] = - cms::alpakatools::element_global_index_range_truncated(acc, Vec1::all(numElements)); - - for (uint32_t i = firstElementIdxGlobal[0u]; i < endElementIdxGlobal[0u]; ++i) { + cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, numElements, [&](uint32_t i) { // theoreticalResult = i+i^2 = i*(i+1) if (result[i] != i * (i + 1)) { printf("Wrong vectorAdd results, i = %u, c[i] = %f.\n", i, result[i]); } - } + }); } };
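
For reference, a minimal sketch of how a kernel body reads once it is written against the new 1D helpers, assuming the Vec1/WorkDiv1 aliases from AlpakaCore/alpakaConfig.h; the `scale` functor and its buffer are hypothetical, and only the cms::alpakatools::for_each_element_1D_grid_stride call comes from the helpers added above. On GPU back-ends each thread handles a single element and strides by the grid size; on CPU back-ends each thread additionally loops over its own elements, which is exactly the hand-written double loop removed throughout this patch.

    // Hypothetical kernel: scales n values in place. Only the helper call is
    // taken from this patch; the functor and the buffer are illustrative.
    struct scale {
      template <typename T_Acc>
      ALPAKA_FN_ACC void operator()(const T_Acc& acc, float* __restrict__ x, uint32_t n) const {
        // Replaces the manual getWorkDiv / element_index_range / std::min loops:
        // the helper hides the element-vs-thread bookkeeping of the CPU and GPU back-ends.
        cms::alpakatools::for_each_element_1D_grid_stride(acc, n, [&](uint32_t i) { x[i] *= 2.f; });
      }
    };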
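
On the host side, the updated tests build the work division with cms::alpakatools::make_workdiv, whose definition sits in the unchanged part of alpakaWorkDivHelper.h and is therefore not visible in this diff. The sketch below only mirrors the call pattern used in the updated prefixScan_t.cc; the Acc1 accelerator alias, the queue and the buffers are assumptions.

    // Assumed launch pattern (mirrors the updated prefixScan_t.cc): blocksPerGrid
    // is always a number of blocks, while the second vector is interpreted as
    // threads per block on GPU back-ends and as elements per thread on CPU
    // back-ends, the swap being handled inside make_workdiv.
    const Vec1 blocksPerGrid(Vec1::all((n + 255) / 256));
    const Vec1 threadsPerBlockOrElementsPerThread(Vec1::all(256u));
    const WorkDiv1& workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread);

    // Acc1 is assumed to be the 1D accelerator alias from alpakaConfig.h.
    alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel<Acc1>(workDiv, scale(), x_d, n));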
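
Finally, the new CUDACore/CMSUnrollLoop.h replaces the bare #pragma unroll, which only the device compilers honour, with macros that pick a suitable pragma per compiler and expand to nothing elsewhere. A short illustration of the intended use, with the loop body elided:

    #include "CUDACore/CMSUnrollLoop.h"

    // CMS_UNROLL_LOOP expands to _Pragma("unroll") on the CUDA/HIP device compilers,
    // _Pragma("clang loop unroll(enable)") on a clang host,
    // _Pragma("GCC ivdep") on a GCC host, and to nothing on unknown compilers.
    CMS_UNROLL_LOOP
    for (int offset = 1; offset < 32; offset <<= 1) {
      // ... warp-level combine step, as in warpPrefixScan() above ...
    }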