From b70e6e229457094b24d7f09961ee5855070cb80e Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 3 Apr 2024 23:14:12 +0200 Subject: [PATCH 1/3] Replace legacy loop names with the more explicit ones In the cms::alpakatools namespace - replace elements_with_stride(...) with uniform_elements(...) - replace blocks_with_stride(...) with uniform_groups(...) - replace elements_in_block(...) with uniform_group_elements(...) --- .../AlpakaTest/plugins/alpaka/TestAlgo.dev.cc | 18 +++++++-------- .../AmplitudeComputationCommonKernels.h | 14 ++++++------ .../alpaka/AmplitudeComputationKernels.dev.cc | 2 +- .../plugins/alpaka/TimeComputationKernels.h | 22 +++++++++---------- .../plugins/alpaka/PFClusterECLCC.h | 6 ++--- .../alpaka/PFClusterSoAProducerKernel.dev.cc | 6 ++--- .../alpaka/PFRecHitProducerKernel.dev.cc | 4 ++-- 7 files changed, 36 insertions(+), 36 deletions(-) diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc index c56ecc3cf1234..7f73394622712 100644 --- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc +++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc @@ -30,7 +30,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // make a strided loop over the kernel grid, covering up to "size" elements - for (int32_t i : elements_with_stride(acc, view.metadata().size())) { + for (int32_t i : uniform_elements(acc, view.metadata().size())) { view[i] = {xvalue, 0., 0., i, flags, matrix * i}; } } @@ -52,7 +52,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // make a strided loop over the kernel grid, covering up to "size" elements - for (int32_t i : elements_with_stride(acc, view.metadata().size())) { + for (int32_t i : uniform_elements(acc, view.metadata().size())) { view[i] = {xvalue, 0., 0., i, matrix * i}; } } @@ -74,7 +74,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // make a strided loop over the kernel grid, covering up to "size" elements - for (int32_t i : elements_with_stride(acc, view.metadata().size())) { + for (int32_t i : uniform_elements(acc, view.metadata().size())) { view[i] = {xvalue, 0., 0., i, matrix * i}; } } @@ -174,7 +174,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // make a strided loop over the kernel grid, covering up to "size" elements - for (int32_t i : elements_with_stride(acc, output.metadata().size())) { + for (int32_t i : uniform_elements(acc, output.metadata().size())) { double x = input[i].x(); if (i < esData.size()) { x += esData.val(i) + esData.val2(i); @@ -200,14 +200,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // make a strided loop over the kernel grid, covering up to "size" elements - for (int32_t i : elements_with_stride(acc, output.metadata().size())) { + for (int32_t i : uniform_elements(acc, output.metadata().size())) { double x = input[i].x(); if (i < esData.size()) { x += esData.val(i) + esData.val2(i); } output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()}; } - for (int32_t i : elements_with_stride(acc, output2.metadata().size())) { + for (int32_t i : uniform_elements(acc, output2.metadata().size())) { double x2 = input2[i].x2(); if (i < esData.size()) { x2 += esData.val(i) + esData.val2(i); @@ -236,7 +236,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // make a strided loop over the kernel grid, covering up to "size" elements - for (int32_t i : elements_with_stride(acc, output.metadata().size())) { + for (int32_t i : uniform_elements(acc, output.metadata().size())) { double x = input[i].x(); if (i 
< esData.size()) { x += esData.val(i) + esData.val2(i); @@ -245,14 +245,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()}; } - for (int32_t i : elements_with_stride(acc, output2.metadata().size())) { + for (int32_t i : uniform_elements(acc, output2.metadata().size())) { double x2 = input2[i].x2(); if (i < esData.size()) { x2 += esData.val(i) + esData.val2(i); } output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()}; } - for (int32_t i : elements_with_stride(acc, output3.metadata().size())) { + for (int32_t i : uniform_elements(acc, output3.metadata().size())) { double x3 = input3[i].x3(); if (i < esData.size()) { x3 += esData.val(i) + esData.val2(i); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationCommonKernels.h index e590ce0d8b795..1f946addb4e34 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationCommonKernels.h +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationCommonKernels.h @@ -66,8 +66,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { auto* shr_hasSwitchToGain0_tmp = shr_isSaturated + elemsPerBlock; auto* shr_counts = reinterpret_cast(shr_hasSwitchToGain0_tmp) + elemsPerBlock; - for (auto block : cms::alpakatools::blocks_with_stride(acc, totalElements)) { - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { // set the output collection size scalars if (idx.global == 0) { uncalibRecHitsEB.size() = nchannelsEB; @@ -91,7 +91,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { alpaka::syncBlockThreads(acc); - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const sample = idx.local % nsamples; // non-divergent branch (except for the last 4 threads) @@ -118,7 +118,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { alpaka::syncBlockThreads(acc); - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const sample = idx.local % nsamples; if (sample < 2) { @@ -141,7 +141,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { alpaka::syncBlockThreads(acc); - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const ch = idx.global / nsamples; auto const sample = idx.local % nsamples; @@ -164,7 +164,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { // check if we can remove it alpaka::syncBlockThreads(acc); - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const ch = idx.global / nsamples; auto const sample = idx.local % nsamples; @@ -355,7 +355,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { auto const elemsPerBlockY = alpaka::getWorkDiv(acc)[0u]; Vec2D const size_2d = {elemsPerBlockY, blockDimX * elemsPerBlockX}; // {y, x} coordinates - for (auto ndindex : 
cms::alpakatools::elements_with_stride_nd(acc, size_2d)) { + for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size_2d)) { auto const ch = ndindex[1] / nsamples; auto const tx = ndindex[1] % nsamples; auto const ty = ndindex[0]; diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.dev.cc b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.dev.cc index fcf9e5de16f40..552761653bb23 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.dev.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.dev.cc @@ -91,7 +91,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { DataType* shrmem = alpaka::getDynSharedMem(acc); // channel - for (auto idx : cms::alpakatools::elements_with_stride(acc, nchannels)) { + for (auto idx : cms::alpakatools::uniform_elements(acc, nchannels)) { if (static_cast(acState[idx]) == MinimizationState::Precomputed) continue; diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/TimeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/TimeComputationKernels.h index 667e4d4687e51..05e01954215af 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/TimeComputationKernels.h +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/TimeComputationKernels.h @@ -53,7 +53,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { auto* s_sumA = s_sum1 + elemsPerBlock; auto* s_sumAA = s_sumA + elemsPerBlock; - for (auto txforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) { + for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) { // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution auto tx = nchannels * nsamples - 1 - txforward; auto const ch = tx / nsamples; @@ -163,8 +163,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { auto* shr_internalCondForSkipping1 = shr_condForUselessSamples + elemsPerBlock; auto* shr_internalCondForSkipping2 = shr_internalCondForSkipping1 + elemsPerBlock; - for (auto block : cms::alpakatools::blocks_with_stride(acc, totalElements)) { - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const ch = idx.global / nthreads_per_channel; auto const ltx = idx.global % nthreads_per_channel; @@ -396,7 +396,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { bool oddElements = nthreads_per_channel % 2; CMS_UNROLL_LOOP while (iter >= 1) { - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const ltx = idx.global % nthreads_per_channel; if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) { @@ -411,7 +411,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { iter = iter == 1 ? 
iter / 2 : iter / 2 + iter % 2; } - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const ltx = idx.global % nthreads_per_channel; // get precomputedflags for this element from shared memory @@ -459,7 +459,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { oddElements = nthreads_per_channel % 2; CMS_UNROLL_LOOP while (iter >= 1) { - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const ltx = idx.global % nthreads_per_channel; if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) { @@ -475,7 +475,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; } - for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) { auto const ltx = idx.global % nthreads_per_channel; // load from shared memory the 0th guy (will contain accumulated values) @@ -559,7 +559,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { auto* shr_sumAf = alpaka::getDynSharedMem(acc); auto* shr_sumff = shr_sumAf + elemsPerBlock; - for (auto gtxforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) { + for (auto gtxforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) { // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution auto gtx = nchannels * nsamples - 1 - gtxforward; auto const ch = gtx / nsamples; @@ -744,7 +744,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { auto const elemsPerBlock = alpaka::getWorkDiv(acc)[0u]; - for (auto gtx : cms::alpakatools::elements_with_stride(acc, nchannelsEB * nsamples)) { + for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannelsEB * nsamples)) { auto const elemIdx = gtx % elemsPerBlock; auto const sample = elemIdx % nsamples; auto const ch = gtx / nsamples; @@ -800,7 +800,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { auto* shrSampleValues = alpaka::getDynSharedMem(acc); auto* shrSampleValueErrors = shrSampleValues + elemsPerBlock; - for (auto txforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) { + for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) { // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution auto tx = nchannels * nsamples - 1 - txforward; auto const ch = tx / nsamples; @@ -988,7 +988,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { auto const offsetForInputs = nchannelsEB; auto const offsetForHashes = conditionsDev.offsetEE(); - for (auto gtx : cms::alpakatools::elements_with_stride(acc, nchannels)) { + for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannels)) { const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx; auto const* dids = gtx >= offsetForInputs ? digisDevEE.id() : digisDevEB.id(); auto const* digis = gtx >= offsetForInputs ? 
digisDevEE.data()->data() : digisDevEB.data()->data(); diff --git a/RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterECLCC.h b/RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterECLCC.h index b1fc0a35f4396..abf63c01e9531 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterECLCC.h +++ b/RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterECLCC.h @@ -85,7 +85,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars) const { const int nRH = pfRecHits.size(); - for (int v : cms::alpakatools::elements_with_stride(acc, nRH)) { + for (int v : cms::alpakatools::uniform_elements(acc, nRH)) { const int beg = pfClusteringEdgeVars[v].pfrh_edgeIdx(); const int end = pfClusteringEdgeVars[v + 1].pfrh_edgeIdx(); int m = v; @@ -110,7 +110,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars) const { const int nRH = pfRecHits.size(); - for (int v : cms::alpakatools::elements_with_stride(acc, nRH)) { + for (int v : cms::alpakatools::uniform_elements(acc, nRH)) { const int vstat = pfClusteringVars[v].pfrh_topoId(); if (v != vstat) { const int beg = pfClusteringEdgeVars[v].pfrh_edgeIdx(); @@ -155,7 +155,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars) const { const int nRH = pfRecHits.size(); - for (int v : cms::alpakatools::elements_with_stride(acc, nRH)) { + for (int v : cms::alpakatools::uniform_elements(acc, nRH)) { int next, vstat = pfClusteringVars[v].pfrh_topoId(); const int old = vstat; while (vstat > (next = pfClusteringVars[vstat].pfrh_topoId())) { diff --git a/RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterSoAProducerKernel.dev.cc b/RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterSoAProducerKernel.dev.cc index 80ab1329d0730..53095381d951b 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterSoAProducerKernel.dev.cc +++ b/RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFClusterSoAProducerKernel.dev.cc @@ -1098,7 +1098,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { clusterView.size() = nRH; } - for (auto i : elements_with_stride(acc, nRH)) { + for (auto i : uniform_elements(acc, nRH)) { // Initialize arrays pfClusteringVars[i].pfrh_isSeed() = 0; pfClusteringVars[i].rhCount() = 0; @@ -1176,7 +1176,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { pfClusteringVars.nEdges() = nRH * 8; pfClusteringEdgeVars[nRH].pfrh_edgeIdx() = nRH * 8; } - for (uint32_t i : cms::alpakatools::elements_with_stride(acc, nRH)) { + for (uint32_t i : cms::alpakatools::uniform_elements(acc, nRH)) { pfClusteringEdgeVars[i].pfrh_edgeIdx() = i * 8; pfClusteringVars[i].pfrh_topoId() = 0; for (int j = 0; j < 8; j++) { // checking if neighbours exist and assigning neighbours as edges @@ -1323,7 +1323,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { reco::PFRecHitFractionDeviceCollection::View fracView) const { const int nRH = pfRecHits.size(); - for (auto index : elements_with_stride_nd(acc, {nRH, nRH})) { + for (auto index : uniform_elements_nd(acc, {nRH, nRH})) { const int i = index[0u]; // i is a seed index const int j = index[1u]; // j is NOT a seed int topoId = pfClusteringVars[i].pfrh_topoId(); diff --git a/RecoParticleFlow/PFRecHitProducer/plugins/alpaka/PFRecHitProducerKernel.dev.cc b/RecoParticleFlow/PFRecHitProducer/plugins/alpaka/PFRecHitProducerKernel.dev.cc index ef18ebc5ecc93..e0bdbab4e2b48 
100644
--- a/RecoParticleFlow/PFRecHitProducer/plugins/alpaka/PFRecHitProducerKernel.dev.cc
+++ b/RecoParticleFlow/PFRecHitProducer/plugins/alpaka/PFRecHitProducerKernel.dev.cc
@@ -22,7 +22,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
                                   uint32_t* __restrict__ denseId2pfRecHit,
                                   uint32_t* __restrict__ num_pfRecHits) const {
       // Strided loop over CaloRecHits
-      for (int32_t i : cms::alpakatools::elements_with_stride(acc, recHits.metadata().size())) {
+      for (int32_t i : cms::alpakatools::uniform_elements(acc, recHits.metadata().size())) {
         // Check energy thresholds/quality cuts (specialised for HCAL/ECAL)
         if (!applyCuts(recHits[i], params, topology))
           continue;
@@ -142,7 +142,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
     pfRecHits.size() = *num_pfRecHits;

     // Assign position information and associate neighbours
-    for (int32_t i : cms::alpakatools::elements_with_stride(acc, *num_pfRecHits)) {
+    for (int32_t i : cms::alpakatools::uniform_elements(acc, *num_pfRecHits)) {
       const uint32_t denseId = CAL::detId2denseId(pfRecHits.detId(i));

       pfRecHits.x(i) = topology.positionX(denseId);

From f7c445c99644435d7c8717e194f40621c11b523b Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Wed, 3 Apr 2024 23:17:44 +0200
Subject: [PATCH 2/3] Drop legacy loop names

---
 .../AlpakaInterface/interface/workdivision.h | 67 -------------------
 1 file changed, 67 deletions(-)

diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index fe02f9646605a..3475d00e91259 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -330,23 +330,6 @@ namespace cms::alpakatools {
     return uniform_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

-  /* elements_with_stride
-   *
-   * `elements_with_stride(acc [, first], extent)` returns a one-dimensional iteratable range that spans the element
-   * indices from `first` (inclusive) to `extent` (exlusive).
-   * If `first` is not specified, it defaults to 0.
-   * If `extent` is not specified, it defaults to the kernel grid size.
-   *
-   * `elements_with_stride(acc, ...)` is a legacy name for `uniform_elements(acc, ...)`.
-   */
-
-  template <typename TAcc,
-            typename... TArgs,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-  ALPAKA_FN_ACC inline auto elements_with_stride(TAcc const& acc, TArgs... args) {
-    return uniform_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
-  }
-
   /* uniform_elements_nd
    *
    * `uniform_elements_nd(acc, extent)` returns an N-dimensional iteratable range that spans the element indices
@@ -553,24 +536,6 @@ namespace cms::alpakatools {
     const Vec extent_;
   };

-  /* elements_with_stride_nd
-   *
-   * `elements_with_stride_nd(acc, extent)` returns an N-dimensional iteratable range that spans the element indices
-   * required to cover the given problem size, indicated by `extent`.
-   *
-   * `elements_with_stride_nd(acc, ...)` is a legacy name for `uniform_elements_nd(acc, ...)`.
-   */
-
-  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-  ALPAKA_FN_ACC inline auto elements_with_stride_nd(TAcc const& acc) {
-    return uniform_elements_nd<TAcc>(acc);
-  }
-
-  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-  ALPAKA_FN_ACC inline auto elements_with_stride_nd(TAcc const& acc, alpaka::Vec<alpaka::Dim<TAcc>, Idx> extent) {
-    return uniform_elements_nd<TAcc>(acc, extent);
-  }
-
   /* uniform_groups_along
    *
    * `uniform_groups_along<Dim>(acc, elements)` returns a one-dimensional iteratable range than spans the group
@@ -769,22 +734,6 @@ namespace cms::alpakatools {
     return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

-  /* blocks_with_stride
-   *
-   * `blocks_with_stride(acc, elements)` returns a one-dimensional iteratable range than spans the group indices
-   * required to cover the given problem size, in units of the block size. `elements` indicates the total number of
-   * elements, across all groups; if not specified, it defaults to the kernel grid size.
-   *
-   * `blocks_with_stride(acc, ...)` is a legacy name for `uniform_groups(acc, ...)`.
-   */
-
-  template <typename TAcc,
-            typename... TArgs,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-  ALPAKA_FN_ACC inline auto blocks_with_stride(TAcc const& acc, TArgs... args) {
-    return uniform_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
-  }
-
   /* uniform_group_elements_along
    *
    * `uniform_group_elements_along<Dim>(acc, group, elements)` returns a one-dimensional iteratable range that spans all
@@ -984,22 +933,6 @@ namespace cms::alpakatools {
     return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

-  /* elements_in_block
-   *
-   * `elements_in_block(acc, group, elements)` returns a one-dimensional iteratable range that spans all the elements
-   * within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements` indicates the
-   * total number of elements across all groups; if not specified, it defaults to the kernel grid size.
-   *
-   * `elements_in_block(acc, ...)` is a legacy for `uniform_group_elements(acc, ...)`.
-   */
-
-  template <typename TAcc,
-            typename... TArgs,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-  ALPAKA_FN_ACC inline auto elements_in_block(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
-  }
-
   /* independent_groups_along
    *
    * `independent_groups_along<Dim>(acc, groups)` returns a one-dimensional iteratable range than spans the group

From 4442472b122e7b2e948f8f4f0290abc728d8576d Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 8 Apr 2024 08:59:10 +0200
Subject: [PATCH 3/3] Split implementation to separate header files

Rename classes to CamelCase and move them to the detail namespace:
  - uniform_elements_along to detail::UniformElementsAlong
  - uniform_groups_along to detail::UniformGroupsAlong
  - uniform_group_elements_along to detail::UniformGroupElementsAlong
  - uniform_elements_nd to detail::UniformElementsND
  - independent_groups_along to detail::IndependentGroupsAlong
  - independent_group_elements_along to detail::IndependentGroupElementsAlong

Introduce helper functions with the old names.
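
For reference, a minimal usage sketch (not part of the patch) of the helpers kept by
this series, in a hypothetical 1-D kernel. Only the cms::alpakatools names, the
ElementIndex fields and alpaka::syncBlockThreads() are taken from the code above;
the kernel and buffer names are illustrative:

  struct ExampleKernel {
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* out, int32_t size) const {
      // was: cms::alpakatools::elements_with_stride(acc, size)
      for (auto i : cms::alpakatools::uniform_elements(acc, size)) {
        out[i] = 0.f;
      }
      // was: blocks_with_stride(acc, size) and elements_in_block(acc, group, size)
      for (auto group : cms::alpakatools::uniform_groups(acc, size)) {
        for (auto idx : cms::alpakatools::uniform_group_elements(acc, group, size)) {
          out[idx.global] += 1.f;  // idx.local is the index within the group
        }
        // legal here: all threads in a block see the same number of group iterations
        alpaka::syncBlockThreads(acc);
      }
    }
  };

The uniform_groups / uniform_group_elements pair keeps every thread of a block
iterating the same number of times, which is what makes the block-level
synchronisation inside the outer loop safe.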
---
 .../AlpakaInterface/interface/workdivision.h | 1183 +++++++++--------
 1 file changed, 656 insertions(+), 527 deletions(-)

diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 3475d00e91259..4647a7c6879fb 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -1,12 +1,13 @@
 #ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
 #define HeterogeneousCore_AlpakaInterface_interface_workdivision_h

+#include <algorithm>
+#include <cstddef>
 #include <type_traits>

 #include <alpaka/alpaka.hpp>

 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
-#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"

 namespace cms::alpakatools {

@@ -78,7 +79,7 @@ namespace cms::alpakatools {
   /* ElementIndex
    *
    * an aggregate that containes the `.global` and `.local` indices of an element; returned by iterating over the objecs
-   * returned by `elements_in_block` and similar functions.
+   * returned by `uniform_group_elements` and similar functions.
    */

   struct ElementIndex {
@@ -86,19 +87,24 @@ namespace cms::alpakatools {
     Idx local;
   };

-  /* uniform_elements_along
+  namespace detail {
+
+    /* UniformElementsAlong
    *
-   * `uniform_elements_along<Dim>(acc [, first], extent)` returns a one-dimensional iteratable range that spans the
+     * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iteratable range that spans the
    * element indices from `first` (inclusive) to `extent` (exlusive) along the `Dim` dimension.
    * If `first` is not specified, it defaults to 0.
    * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension.
    *
-   * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`.
+     * `uniform_elements_along<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that can
+     * infer the accelerator type from the argument.
+     *
+     * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, 0>(acc, ...)`.
    *
    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
    * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
-   * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for
-   * `uniform_elements_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+     * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for
+     * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<TAcc, N-2>` and `<TAcc, N-3>`.
    *
    * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
    * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
@@ -143,103 +149,105 @@ namespace cms::alpakatools {
    * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
*/ - template and alpaka::Dim::value >= Dim>> - class uniform_elements_along { - public: - ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc) - : elements_{alpaka::getWorkDiv(acc)[Dim]}, - first_{alpaka::getIdx(acc)[Dim] * elements_}, - stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, - extent_{stride_} {} - - ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx extent) - : elements_{alpaka::getWorkDiv(acc)[Dim]}, - first_{alpaka::getIdx(acc)[Dim] * elements_}, - stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, - extent_{extent} {} - - ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx first, Idx extent) - : elements_{alpaka::getWorkDiv(acc)[Dim]}, - first_{alpaka::getIdx(acc)[Dim] * elements_ + first}, - stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, - extent_{extent} {} - - class const_iterator; - using iterator = const_iterator; - - ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); } - - ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); } - - class const_iterator { - friend class uniform_elements_along; - - ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first) - : elements_{elements}, - stride_{stride}, - extent_{extent}, - first_{std::min(first, extent)}, - index_{first_}, - range_{std::min(first + elements, extent)} {} - + template and alpaka::Dim::value >= Dim>> + class UniformElementsAlong { public: - ALPAKA_FN_ACC inline Idx operator*() const { return index_; } - - // pre-increment the iterator - ALPAKA_FN_ACC inline const_iterator& operator++() { - if constexpr (requires_single_thread_per_block_v) { - // increment the index along the elements processed by the current thread - ++index_; - if (index_ < range_) + ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, + extent_{stride_} {} + + ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, + extent_{extent} {} + + ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_ + first}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, + extent_{extent} {} + + class const_iterator; + using iterator = const_iterator; + + ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); } + + ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); } + + class const_iterator { + friend class UniformElementsAlong; + + ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first) + : elements_{elements}, + stride_{stride}, + extent_{extent}, + first_{std::min(first, extent)}, + index_{first_}, + range_{std::min(first + elements, extent)} {} + + public: + ALPAKA_FN_ACC inline Idx operator*() const { return index_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline const_iterator& operator++() { + if constexpr (requires_single_thread_per_block_v) { + // increment the index along the elements processed by the current thread + ++index_; + if (index_ < range_) + return *this; + } + + // increment 
the thread index with the grid stride + first_ += stride_; + index_ = first_; + range_ = std::min(first_ + elements_, extent_); + if (index_ < extent_) return *this; - } - // increment the thread index with the grid stride - first_ += stride_; - index_ = first_; - range_ = std::min(first_ + elements_, extent_); - if (index_ < extent_) + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + index_ = extent_; + range_ = extent_; return *this; + } - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = extent_; - index_ = extent_; - range_ = extent_; - return *this; - } + // post-increment the iterator + ALPAKA_FN_ACC inline const_iterator operator++(int) { + const_iterator old = *this; + ++(*this); + return old; + } - // post-increment the iterator - ALPAKA_FN_ACC inline const_iterator operator++(int) { - const_iterator old = *this; - ++(*this); - return old; - } + ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { + return (index_ == other.index_) and (first_ == other.first_); + } - ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { - return (index_ == other.index_) and (first_ == other.first_); - } + ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); } - ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); } + private: + // non-const to support iterator copy and assignment + Idx elements_; + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + Idx index_; + Idx range_; + }; private: - // non-const to support iterator copy and assignment - Idx elements_; - Idx stride_; - Idx extent_; - // modified by the pre/post-increment operator - Idx first_; - Idx index_; - Idx range_; + const Idx elements_; + const Idx first_; + const Idx stride_; + const Idx extent_; }; - private: - const Idx elements_; - const Idx first_; - const Idx stride_; - const Idx extent_; - }; + } // namespace detail /* uniform_elements * @@ -248,7 +256,7 @@ namespace cms::alpakatools { * If `first` is not specified, it defaults to 0. * If `extent` is not specified, it defaults to the kernel grid size. * - * `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`. + * `uniform_elements(acc, ...)` is a shorthand for `detail::UniformElementsAlong(acc, ...)`. * * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. @@ -301,7 +309,21 @@ namespace cms::alpakatools { typename... TArgs, typename = std::enable_if_t and alpaka::Dim::value == 1>> ALPAKA_FN_ACC inline auto uniform_elements(TAcc const& acc, TArgs... args) { - return uniform_elements_along(acc, static_cast(args)...); + return detail::UniformElementsAlong(acc, static_cast(args)...); + } + + /* uniform_elements_along + * + * `uniform_elements_along(acc, ...)` is a shorthand for `detail::UniformElementsAlong(acc, ...)` that can + * infer the accelerator type from the argument. + */ + + template and alpaka::Dim::value >= Dim>> + ALPAKA_FN_ACC inline auto uniform_elements_along(TAcc const& acc, TArgs... args) { + return detail::UniformElementsAlong(acc, static_cast(args)...); } /* uniform_elements_x, _y, _z @@ -313,28 +335,32 @@ namespace cms::alpakatools { typename... 
TArgs, typename = std::enable_if_t and (alpaka::Dim::value > 0)>> ALPAKA_FN_ACC inline auto uniform_elements_x(TAcc const& acc, TArgs... args) { - return uniform_elements_along::value - 1>(acc, static_cast(args)...); + return detail::UniformElementsAlong::value - 1>(acc, static_cast(args)...); } template and (alpaka::Dim::value > 1)>> ALPAKA_FN_ACC inline auto uniform_elements_y(TAcc const& acc, TArgs... args) { - return uniform_elements_along::value - 2>(acc, static_cast(args)...); + return detail::UniformElementsAlong::value - 2>(acc, static_cast(args)...); } template and (alpaka::Dim::value > 2)>> ALPAKA_FN_ACC inline auto uniform_elements_z(TAcc const& acc, TArgs... args) { - return uniform_elements_along::value - 3>(acc, static_cast(args)...); + return detail::UniformElementsAlong::value - 3>(acc, static_cast(args)...); } - /* uniform_elements_nd + namespace detail { + + /* UniformElementsND * - * `uniform_elements_nd(acc, extent)` returns an N-dimensional iteratable range that spans the element indices + * `UniformElementsND(acc, extent)` returns an N-dimensional iteratable range that spans the element indices * required to cover the given problem size, indicated by `extent`. * + * `uniform_elements_nd(acc, ...)` is an alias for `UniformElementsND(acc, ...)`. + * * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner @@ -364,194 +390,218 @@ namespace cms::alpakatools { * } * } * - * For more details, see `uniform_elements_along(acc, ...)`. + * For more details, see `UniformElementsAlong(acc, ...)`. 
*/ - template and (alpaka::Dim::value > 0)>> - class uniform_elements_nd { - public: - using Dim = alpaka::Dim; - using Vec = alpaka::Vec; - - ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc) - : elements_{alpaka::getWorkDiv(acc)}, - thread_{alpaka::getIdx(acc) * elements_}, - stride_{alpaka::getWorkDiv(acc) * elements_}, - extent_{stride_} {} - - ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc, Vec extent) - : elements_{alpaka::getWorkDiv(acc)}, - thread_{alpaka::getIdx(acc) * elements_}, - stride_{alpaka::getWorkDiv(acc) * elements_}, - extent_{extent} {} - - // tag used to construct an end iterator - struct at_end_t {}; - - class const_iterator; - using iterator = const_iterator; + template and (alpaka::Dim::value > 0)>> + class UniformElementsND { + public: + using Dim = alpaka::Dim; + using Vec = alpaka::Vec; + + ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc) + : elements_{alpaka::getWorkDiv(acc)}, + thread_{alpaka::getIdx(acc) * elements_}, + stride_{alpaka::getWorkDiv(acc) * elements_}, + extent_{stride_} {} + + ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent) + : elements_{alpaka::getWorkDiv(acc)}, + thread_{alpaka::getIdx(acc) * elements_}, + stride_{alpaka::getWorkDiv(acc) * elements_}, + extent_{extent} {} + + // tag used to construct an end iterator + struct at_end_t {}; + + class const_iterator; + using iterator = const_iterator; + + ALPAKA_FN_ACC inline const_iterator begin() const { + // check that all dimensions of the current thread index are within the extent + if ((thread_ < extent_).all()) { + // construct an iterator pointing to the first element to be processed by the current thread + return const_iterator{this, thread_}; + } else { + // construct an end iterator, pointing post the end of the extent + return const_iterator{this, at_end_t{}}; + } + } - ALPAKA_FN_ACC inline const_iterator begin() const { - // check that all dimensions of the current thread index are within the extent - if ((thread_ < extent_).all()) { - // construct an iterator pointing to the first element to be processed by the current thread - return const_iterator{this, thread_}; - } else { + ALPAKA_FN_ACC inline const_iterator end() const { // construct an end iterator, pointing post the end of the extent return const_iterator{this, at_end_t{}}; } - } - ALPAKA_FN_ACC inline const_iterator end() const { - // construct an end iterator, pointing post the end of the extent - return const_iterator{this, at_end_t{}}; - } + class const_iterator { + friend class UniformElementsND; - class const_iterator { - friend class uniform_elements_nd; + public: + ALPAKA_FN_ACC inline Vec operator*() const { return index_; } - public: - ALPAKA_FN_ACC inline Vec operator*() const { return index_; } + // pre-increment the iterator + ALPAKA_FN_ACC constexpr inline const_iterator operator++() { + increment(); + return *this; + } - // pre-increment the iterator - ALPAKA_FN_ACC constexpr inline const_iterator operator++() { - increment(); - return *this; - } + // post-increment the iterator + ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) { + const_iterator old = *this; + increment(); + return old; + } - // post-increment the iterator - ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) { - const_iterator old = *this; - increment(); - return old; - } + ALPAKA_FN_ACC constexpr inline bool operator==(const_iterator const& other) const { + return (index_ == other.index_); + } - ALPAKA_FN_ACC constexpr inline bool operator==(const_iterator const& other) 
const { - return (index_ == other.index_); - } + ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const { + return not(*this == other); + } - ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const { return not(*this == other); } + private: + // construct an iterator pointing to the first element to be processed by the current thread + ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first) + : loop_{loop}, + first_{alpaka::elementwise_min(first, loop->extent_)}, + range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}, + index_{first_} {} - private: - // construct an iterator pointing to the first element to be processed by the current thread - ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, Vec first) - : loop_{loop}, - first_{alpaka::elementwise_min(first, loop->extent_)}, - range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}, - index_{first_} {} - - // construct an end iterator, pointing post the end of the extent - ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, at_end_t const&) - : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {} - - template - ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() { - bool overflow = false; - ++index_[I]; - if (index_[I] >= range_[I]) { - index_[I] = first_[I]; - overflow = true; + // construct an end iterator, pointing post the end of the extent + ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&) + : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {} + + template + ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() { + bool overflow = false; + ++index_[I]; + if (index_[I] >= range_[I]) { + index_[I] = first_[I]; + overflow = true; + } + return overflow; } - return overflow; - } - template - ALPAKA_FN_ACC inline constexpr bool do_elements_loops() { - if constexpr (N == 0) { - // overflow - return true; - } else { - if (not nth_elements_loop()) { - return false; + template + ALPAKA_FN_ACC inline constexpr bool do_elements_loops() { + if constexpr (N == 0) { + // overflow + return true; } else { - return do_elements_loops(); + if (not nth_elements_loop()) { + return false; + } else { + return do_elements_loops(); + } } } - } - template - ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() { - bool overflow = false; - first_[I] += loop_->stride_[I]; - if (first_[I] >= loop_->extent_[I]) { - first_[I] = loop_->thread_[I]; - overflow = true; + template + ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() { + bool overflow = false; + first_[I] += loop_->stride_[I]; + if (first_[I] >= loop_->extent_[I]) { + first_[I] = loop_->thread_[I]; + overflow = true; + } + index_[I] = first_[I]; + range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]); + return overflow; } - index_[I] = first_[I]; - range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]); - return overflow; - } - template - ALPAKA_FN_ACC inline constexpr bool do_strided_loops() { - if constexpr (N == 0) { - // overflow - return true; - } else { - if (not nth_strided_loop()) { - return false; + template + ALPAKA_FN_ACC inline constexpr bool do_strided_loops() { + if constexpr (N == 0) { + // overflow + return true; } else { - return do_strided_loops(); + if (not nth_strided_loop()) { + return false; + } else { + return do_strided_loops(); + } } } - } - // increment the iterator - ALPAKA_FN_ACC 
inline constexpr void increment() { - if constexpr (requires_single_thread_per_block_v) { - // linear N-dimensional loops over the elements associated to the thread; - // do_elements_loops<>() returns true if any of those loops overflows - if (not do_elements_loops()) { - // the elements loops did not overflow, return the next index + // increment the iterator + ALPAKA_FN_ACC inline constexpr void increment() { + if constexpr (requires_single_thread_per_block_v) { + // linear N-dimensional loops over the elements associated to the thread; + // do_elements_loops<>() returns true if any of those loops overflows + if (not do_elements_loops()) { + // the elements loops did not overflow, return the next index + return; + } + } + + // strided N-dimensional loop over the threads in the kernel launch grid; + // do_strided_loops<>() returns true if any of those loops overflows + if (not do_strided_loops()) { + // the strided loops did not overflow, return the next index return; } - } - // strided N-dimensional loop over the threads in the kernel launch grid; - // do_strided_loops<>() returns true if any of those loops overflows - if (not do_strided_loops()) { - // the strided loops did not overflow, return the next index - return; + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = loop_->extent_; + range_ = loop_->extent_; + index_ = loop_->extent_; } - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = loop_->extent_; - range_ = loop_->extent_; - index_ = loop_->extent_; - } + // const pointer to the UniformElementsND that the iterator refers to + const UniformElementsND* loop_; - // const pointer to the uniform_elements_nd that the iterator refers to - const uniform_elements_nd* loop_; + // modified by the pre/post-increment operator + Vec first_; // first element processed by this thread + Vec range_; // last element processed by this thread + Vec index_; // current element processed by this thread + }; - // modified by the pre/post-increment operator - Vec first_; // first element processed by this thread - Vec range_; // last element processed by this thread - Vec index_; // current element processed by this thread + private: + const Vec elements_; + const Vec thread_; + const Vec stride_; + const Vec extent_; }; - private: - const Vec elements_; - const Vec thread_; - const Vec stride_; - const Vec extent_; - }; + } // namespace detail - /* uniform_groups_along + /* uniform_elements_nd + * + * `uniform_elements_nd(acc, ...)` is a shorthand for `detail::UniformElementsND(acc, ...)`. + */ + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc) { + return detail::UniformElementsND(acc); + } + + template and (alpaka::Dim::value > 0)>> + ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc, alpaka::Vec, Idx> extent) { + return detail::UniformElementsND(acc, extent); + } + + namespace detail { + + /* UniformGroupsAlong * - * `uniform_groups_along(acc, elements)` returns a one-dimensional iteratable range than spans the group indices + * `UniformGroupsAlong(acc, elements)` returns a one-dimensional iteratable range than spans the group indices * required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements` * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid size * along the `Dim` dimension. 
* - * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`. + * `uniform_groups_along(acc, ...)` is a shorthand for `UniformGroupsAlong(acc, ...)` that can infer + * the accelerator type from the argument. + * + * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `UniformGroupsAlong(acc, ...)`. * * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). - * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for - * `uniform_groups_along(acc, ...)`, `` and ``. + * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for + * `UniformGroupsAlong(acc, ...)`, `` and ``. * - * `uniform_groups_along` should be called consistently by all the threads in a block. All threads in a block see - * the same loop iterations, while threads in different blocks may see a different number of iterations. + * `uniform_groups_along(acc, ...)` should be called consistently by all the threads in a block. All threads in a + * block see the same loop iterations, while threads in different blocks may see a different number of iterations. * If the work division has more blocks than the required number of groups, the first blocks will perform one * iteration of the loop, while the other blocks will exit the loop immediately. * If the work division has less blocks than the required number of groups, some of the blocks will perform more than @@ -560,7 +610,7 @@ namespace cms::alpakatools { * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller * than the block size. However, also in this case all threads in the block will execute the same number of iterations * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop - * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by + * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by * `uniform_group_elements_along(acc, group, elements)`. * * For example, if the block size is 64 and there are 400 elements @@ -581,78 +631,80 @@ namespace cms::alpakatools { * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process * group 3. * - * See `uniform_elements_along(acc, ...)` for a concrete example using `uniform_groups_along` and + * See `UniformElementsAlong(acc, ...)` for a concrete example using `uniform_groups_along` and * `uniform_group_elements_along`. 
*/ - template and alpaka::Dim::value >= Dim>> - class uniform_groups_along { - public: - ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc) - : first_{alpaka::getIdx(acc)[Dim]}, - stride_{alpaka::getWorkDiv(acc)[Dim]}, - extent_{stride_} {} + template and alpaka::Dim::value >= Dim>> + class UniformGroupsAlong { + public: + ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc) + : first_{alpaka::getIdx(acc)[Dim]}, + stride_{alpaka::getWorkDiv(acc)[Dim]}, + extent_{stride_} {} - // extent is the total number of elements (not blocks) - ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc, Idx extent) - : first_{alpaka::getIdx(acc)[Dim]}, - stride_{alpaka::getWorkDiv(acc)[Dim]}, - extent_{divide_up_by(extent, alpaka::getWorkDiv(acc)[Dim])} {} + // extent is the total number of elements (not blocks) + ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent) + : first_{alpaka::getIdx(acc)[Dim]}, + stride_{alpaka::getWorkDiv(acc)[Dim]}, + extent_{divide_up_by(extent, alpaka::getWorkDiv(acc)[Dim])} {} - class const_iterator; - using iterator = const_iterator; + class const_iterator; + using iterator = const_iterator; - ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); } + ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); } - ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); } + ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); } - class const_iterator { - friend class uniform_groups_along; + class const_iterator { + friend class UniformGroupsAlong; - ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first) - : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {} + ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first) + : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {} - public: - ALPAKA_FN_ACC inline Idx operator*() const { return first_; } + public: + ALPAKA_FN_ACC inline Idx operator*() const { return first_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline const_iterator& operator++() { + // increment the first-element-in-block index by the grid stride + first_ += stride_; + if (first_ < extent_) + return *this; - // pre-increment the iterator - ALPAKA_FN_ACC inline const_iterator& operator++() { - // increment the first-element-in-block index by the grid stride - first_ += stride_; - if (first_ < extent_) + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; return *this; + } - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = extent_; - return *this; - } + // post-increment the iterator + ALPAKA_FN_ACC inline const_iterator operator++(int) { + const_iterator old = *this; + ++(*this); + return old; + } - // post-increment the iterator - ALPAKA_FN_ACC inline const_iterator operator++(int) { - const_iterator old = *this; - ++(*this); - return old; - } + ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); } - ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); } + ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); } - ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); } + private: + // 
non-const to support iterator copy and assignment + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + }; private: - // non-const to support iterator copy and assignment - Idx stride_; - Idx extent_; - // modified by the pre/post-increment operator - Idx first_; + const Idx first_; + const Idx stride_; + const Idx extent_; }; - private: - const Idx first_; - const Idx stride_; - const Idx extent_; - }; + } // namespace detail /* uniform_groups * @@ -660,7 +712,7 @@ namespace cms::alpakatools { * cover the given problem size, in units of the block size. `elements` indicates the total number of elements, across * all groups; if not specified, it defaults to the kernel grid size. * - * `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`. + * `uniform_groups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong(acc, ...)`. * * `uniform_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block see * the same loop iterations, while threads in different blocks may see a different number of iterations. @@ -672,7 +724,7 @@ namespace cms::alpakatools { * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller * than the block size. However, also in this case all threads in the block will execute the same number of iterations * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop - * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by + * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by * `uniform_group_elements(acc, group, elements)`. * * For example, if the block size is 64 and there are 400 elements @@ -705,7 +757,21 @@ namespace cms::alpakatools { typename... TArgs, typename = std::enable_if_t and alpaka::Dim::value == 1>> ALPAKA_FN_ACC inline auto uniform_groups(TAcc const& acc, TArgs... args) { - return uniform_groups_along(acc, static_cast(args)...); + return detail::UniformGroupsAlong(acc, static_cast(args)...); + } + + /* uniform_groups_along + * + * `uniform_groups_along(acc, ...)` is a shorthand for `detail::UniformGroupsAlong(acc, ...)` that can infer + * the accelerator type from the argument. + */ + + template and alpaka::Dim::value >= Dim>> + ALPAKA_FN_ACC inline auto uniform_groups_along(TAcc const& acc, TArgs... args) { + return detail::UniformGroupsAlong(acc, static_cast(args)...); } /* uniform_groups_x, _y, _z @@ -717,37 +783,42 @@ namespace cms::alpakatools { typename... TArgs, typename = std::enable_if_t and (alpaka::Dim::value > 0)>> ALPAKA_FN_ACC inline auto uniform_groups_x(TAcc const& acc, TArgs... args) { - return uniform_groups_along::value - 1>(acc, static_cast(args)...); + return detail::UniformGroupsAlong::value - 1>(acc, static_cast(args)...); } template and (alpaka::Dim::value > 1)>> ALPAKA_FN_ACC inline auto uniform_groups_y(TAcc const& acc, TArgs... args) { - return uniform_groups_along::value - 2>(acc, static_cast(args)...); + return detail::UniformGroupsAlong::value - 2>(acc, static_cast(args)...); } template and (alpaka::Dim::value > 2)>> ALPAKA_FN_ACC inline auto uniform_groups_z(TAcc const& acc, TArgs... 
args) { - return uniform_groups_along::value - 3>(acc, static_cast(args)...); + return detail::UniformGroupsAlong::value - 3>(acc, static_cast(args)...); } - /* uniform_group_elements_along + namespace detail { + + /* UniformGroupElementsAlong * - * `uniform_group_elements_along(acc, group, elements)` returns a one-dimensional iteratable range that spans all - * the elements within the given `group` along dimension `Dim`, as obtained from `uniform_groups_along`, up to + * `UniformGroupElementsAlong(acc, group, elements)` returns a one-dimensional iteratable range that spans + * all the elements within the given `group` along dimension `Dim`, as obtained from `UniformGroupsAlong`, up to * `elements` (exclusive). `elements` indicates the total number of elements across all groups; if not specified, it * defaults to the kernel grid size. * + * `uniform_group_elements_along(acc, ...)` is a shorthand for `UniformGroupElementsAlong(acc, ...)` + * that can infer the accelerator type from the argument. + * * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for - * `uniform_group_elements_along<0>(acc, ...)`. + * `UniformGroupElementsAlong<0>(acc, ...)`. * - * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by + * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are - * shorthands for `uniform_group_elements_along(acc, ...)`, `` and ``. + * shorthands for `UniformGroupElementsAlong(acc, ...)`, `` and ``. * * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the @@ -759,7 +830,7 @@ namespace cms::alpakatools { * If the problem size is not a multiple of the block size, different threads may execute a different number of * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block * synchronisation is needed, one should split the loop, and synchronise the threads between the loops. - * See `uniform_elements_along(acc, ...)` for a concrete example using `uniform_groups_along` and + * See `UniformElementsAlong(acc, ...)` for a concrete example using `uniform_groups_along` and * `uniform_group_elements_along`. * * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a @@ -782,78 +853,80 @@ namespace cms::alpakatools { * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`. 
    */

-  template <typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-  class uniform_group_elements_along {
-  public:
-    ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block)
-        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
-          local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
-                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
-          range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}
-
-    ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block, Idx extent)
-        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
-          local_{std::min(extent - first_,
-                          alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
-                              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
-          range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}
-
-    class const_iterator;
-    using iterator = const_iterator;
-
-    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }
-
-    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }
-
-    class const_iterator {
-      friend class uniform_group_elements_along;
-
-      ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
-          : index_{local}, first_{first}, range_{range} {}
-
-    public:
-      ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }
-
-      // pre-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator& operator++() {
-        if constexpr (requires_single_thread_per_block_v<TAcc>) {
-          // increment the index along the elements processed by the current thread
-          ++index_;
-          if (index_ < range_)
-            return *this;
-        }
-
-        // the iterator has reached or passed the end of the extent, clamp it to the extent
-        index_ = range_;
-        return *this;
-      }
-
-      // post-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator operator++(int) {
-        const_iterator old = *this;
-        ++(*this);
-        return old;
-      }
-
-      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }
-
-      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
-
-    private:
-      // modified by the pre/post-increment operator
-      Idx index_;
-      // non-const to support iterator copy and assignment
-      Idx first_;
-      Idx range_;
-    };
-
-  private:
-    const Idx first_;
-    const Idx local_;
-    const Idx range_;
-  };
+    template <typename TAcc,
+              std::size_t Dim,
+              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+    class UniformGroupElementsAlong {
+    public:
+      ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
+          : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
+            local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
+                   alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+            range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}
+
+      ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
+          : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
+            local_{std::min(extent - first_,
+                            alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
+                                alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
+            range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}
+
+      class const_iterator;
+      using iterator = const_iterator;
+
+      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }
+
+      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }
+
+      class const_iterator {
+        friend class UniformGroupElementsAlong;
+
+        ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
+            : index_{local}, first_{first}, range_{range} {}
+
+      public:
+        ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }
+
+        // pre-increment the iterator
+        ALPAKA_FN_ACC inline const_iterator& operator++() {
+          if constexpr (requires_single_thread_per_block_v<TAcc>) {
+            // increment the index along the elements processed by the current thread
+            ++index_;
+            if (index_ < range_)
+              return *this;
+          }

+          // the iterator has reached or passed the end of the extent, clamp it to the extent
+          index_ = range_;
+          return *this;
+        }
+
+        // post-increment the iterator
+        ALPAKA_FN_ACC inline const_iterator operator++(int) {
+          const_iterator old = *this;
+          ++(*this);
+          return old;
+        }
+
+        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }
+
+        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
+
+      private:
+        // modified by the pre/post-increment operator
+        Idx index_;
+        // non-const to support iterator copy and assignment
+        Idx first_;
+        Idx range_;
+      };
+
+    private:
+      const Idx first_;
+      const Idx local_;
+      const Idx range_;
+    };
+
+  }  // namespace detail
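To make the synchronisation guarantee described above concrete, here is a hedged sketch of the split-loop pattern: one inner loop fills block-shared memory, a second consumes it, with a block synchronisation in between. The buffer names and the fixed block size of 64 are assumptions for illustration only:

    // inside a kernel operator(), assuming a block size of 64 elements
    auto& work = alpaka::declareSharedVar<float[64], __COUNTER__>(acc);
    for (auto group : cms::alpakatools::uniform_groups(acc, size)) {
      for (auto element : cms::alpakatools::uniform_group_elements(acc, group, size)) {
        work[element.local] = input[element.global];
      }
      // safe: all threads of the block execute the same uniform_groups iterations
      alpaka::syncBlockThreads(acc);
      for (auto element : cms::alpakatools::uniform_group_elements(acc, group, size)) {
        output[element.global] = 2.f * work[element.local];
      }
    }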
   /* uniform_group_elements
    *
@@ -861,7 +934,7 @@ namespace cms::alpakatools {
    * elements within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements`
    * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
    *
-   * `uniform_group_elements(acc, ...)` is a shorthand for `uniform_group_elements_along<0>(acc, ...)`.
+   * `uniform_group_elements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
    *
    * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
    * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
@@ -894,7 +967,7 @@ namespace cms::alpakatools {
    *
    * Note that `uniform_group_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
    * kernels, use
-   * - `uniform_group_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
+   * - `detail::UniformGroupElementsAlong<TAcc, Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
    * - `uniform_group_elements_x(acc, ...)`, `uniform_group_elements_y(acc, ...)`, or
    *   `uniform_group_elements_z(acc, ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
    */
@@ -903,7 +976,21 @@ namespace cms::alpakatools {
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto uniform_group_elements(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+  }
+
+  /* uniform_group_elements_along
+   *
+   * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for
+   * `detail::UniformGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
+   */
+
+  template <std::size_t Dim,
+            typename TAcc,
+            typename... TArgs,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  ALPAKA_FN_ACC inline auto uniform_group_elements_along(TAcc const& acc, TArgs... args) {
+    return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }
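As a hedged illustration of the explicit-dimension form in a 2-dimensional kernel (the extent and buffer names are hypothetical), iterating along dimension 1, the faster one:

    for (auto group : cms::alpakatools::uniform_groups_along<1>(acc, nColumns)) {
      for (auto element : cms::alpakatools::uniform_group_elements_along<1>(acc, group, nColumns)) {
        // element.global indexes the full column range, element.local the position within the group
        rowBuffer[element.global] += offset;
      }
    }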
   /* uniform_group_elements_x, _y, _z
@@ -916,39 +1003,44 @@ namespace cms::alpakatools {
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_x(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_y(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_z(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }
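When porting CUDA-style 2-dimensional loops, the `_y`/`_x` shorthands compose in the same way; a hedged sketch with illustrative matrix and extent names (`_y` covers the slower dimension, `_x` the fastest):

    for (auto rowGroup : cms::alpakatools::uniform_groups_y(acc, nRows)) {
      for (auto row : cms::alpakatools::uniform_group_elements_y(acc, rowGroup, nRows)) {
        for (auto colGroup : cms::alpakatools::uniform_groups_x(acc, nCols)) {
          for (auto col : cms::alpakatools::uniform_group_elements_x(acc, colGroup, nCols)) {
            matrix[row.global * nCols + col.global] *= scale;
          }
        }
      }
    }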
-  /* independent_groups_along
+  namespace detail {
+
+    /* IndependentGroupsAlong
    *
-   * `independent_groups_along<TAcc, Dim>(acc, groups)` returns a one-dimensional iteratable range than spans the group
+     * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iteratable range that spans the group
    * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
    * specified, it defaults to the number of blocks along the `Dim` dimension.
    *
+     * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that
+     * can infer the accelerator type from the argument.
+     *
    * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
-   * `independent_groups_along<0>(acc, ...)`.
+     * `IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
    *
    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
    * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
    * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands
-   * for `independent_groups_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+     * for `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
    *
-   * `independent_groups_along` should be called consistently by all the threads in a block. All threads in a block
-   * see the same loop iterations, while threads in different blocks may see a different number of iterations.
+     * `independent_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads
+     * in a block see the same loop iterations, while threads in different blocks may see a different number of iterations.
    * If the work division has more blocks than the required number of groups, the first blocks will perform one
    * iteration of the loop, while the other blocks will exit the loop immediately.
    * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more than
@@ -968,80 +1060,82 @@ namespace cms::alpakatools {
    * group 3.
    */

-  template <typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-  class independent_groups_along {
-  public:
-    ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc)
-        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
-          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
-          extent_{stride_} {}
-
-    ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc, Idx groups)
-        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
-          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
-          extent_{groups} {}
-
-    class const_iterator;
-    using iterator = const_iterator;
-
-    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }
-
-    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }
-
-    class const_iterator {
-      friend class independent_groups_along;
-
-      ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
-          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
-
-    public:
-      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
-
-      // pre-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator& operator++() {
-        // increment the first-element-in-block index by the grid stride
-        first_ += stride_;
-        if (first_ < extent_)
-          return *this;
-
-        // the iterator has reached or passed the end of the extent, clamp it to the extent
-        first_ = extent_;
-        return *this;
-      }
-
-      // post-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator operator++(int) {
-        const_iterator old = *this;
-        ++(*this);
-        return old;
-      }
-
-      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }
-
-      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
-
-    private:
-      // non-const to support iterator copy and assignment
-      Idx stride_;
-      Idx extent_;
-      // modified by the pre/post-increment operator
-      Idx first_;
-    };
-
-  private:
-    const Idx first_;
-    const Idx stride_;
-    const Idx extent_;
-  };
+    template <typename TAcc,
+              std::size_t Dim,
+              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+    class IndependentGroupsAlong {
+    public:
+      ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)
+          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+            extent_{stride_} {}
+
+      ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)
+          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+            extent_{groups} {}
+
+      class const_iterator;
+      using iterator = const_iterator;
+
+      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }
+
+      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }
+
+      class const_iterator {
+        friend class IndependentGroupsAlong;
+
+        ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
+            : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
+
+      public:
+        ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
+
+        // pre-increment the iterator
+        ALPAKA_FN_ACC inline const_iterator& operator++() {
+          // increment the first-element-in-block index by the grid stride
+          first_ += stride_;
+          if (first_ < extent_)
+            return *this;
+
+          // the iterator has reached or passed the end of the extent, clamp it to the extent
+          first_ = extent_;
+          return *this;
+        }
+
+        // post-increment the iterator
+        ALPAKA_FN_ACC inline const_iterator operator++(int) {
+          const_iterator old = *this;
+          ++(*this);
+          return old;
+        }
+
+        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }
+
+        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
+
+      private:
+        // non-const to support iterator copy and assignment
+        Idx stride_;
+        Idx extent_;
+        // modified by the pre/post-increment operator
+        Idx first_;
+      };
+
+    private:
+      const Idx first_;
+      const Idx stride_;
+      const Idx extent_;
+    };
+
+  }  // namespace detail
   /* independent_groups
    *
    * `independent_groups(acc, groups)` returns a one-dimensional iteratable range that spans the group indices from 0 to
    * `groups`. If `groups` is not specified, it defaults to the number of blocks.
    *
-   * `independent_groups(acc, ...)` is a shorthand for `independent_groups_along<0>(acc, ...)`.
+   * `independent_groups(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
    *
    * `independent_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
    * see the same loop iterations, while threads in different blocks may see a different number of iterations.
@@ -1074,7 +1168,21 @@ namespace cms::alpakatools {
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto independent_groups(TAcc const& acc, TArgs... args) {
-    return independent_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+  }
+
+  /* independent_groups_along
+   *
+   * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, Dim>(acc, ...)`
+   * that can infer the accelerator type from the argument.
+   */
+
+  template <std::size_t Dim,
+            typename TAcc,
+            typename... TArgs,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  ALPAKA_FN_ACC inline auto independent_groups_along(TAcc const& acc, TArgs... args) {
+    return detail::IndependentGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }

   /* independent_groups_x, _y, _z
@@ -1087,123 +1195,130 @@ namespace cms::alpakatools {
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto independent_groups_x(TAcc const& acc, TArgs... args) {
-    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto independent_groups_y(TAcc const& acc, TArgs... args) {
-    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto independent_groups_z(TAcc const& acc, TArgs... args) {
-    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }
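A hedged sketch of the intended use, assigning whole blocks to independent work units; the module count and the per-module device function are hypothetical names, not part of this patch:

    for (auto module : cms::alpakatools::independent_groups(acc, nModules)) {
      // all the threads of one block cooperate on the same module;
      // extra blocks beyond nModules skip the loop body entirely
      processModule(acc, module);  // hypothetical device function
    }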
-  /* independent_group_elements_along
-   */
-
-  template <typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-  class independent_group_elements_along {
-  public:
-    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc)
-        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
-          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          extent_{stride_} {}
-
-    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx extent)
-        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
-          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          extent_{extent} {}
-
-    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx first, Idx extent)
-        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
-          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
-          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          extent_{extent} {}
-
-    class const_iterator;
-    using iterator = const_iterator;
-
-    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }
-
-    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }
-
-    class const_iterator {
-      friend class independent_group_elements_along;
-
-      ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
-          : elements_{elements},
-            stride_{stride},
-            extent_{extent},
-            first_{std::min(first, extent)},
-            index_{first_},
-            range_{std::min(first + elements, extent)} {}
-
-    public:
-      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
-
-      // pre-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator& operator++() {
-        if constexpr (requires_single_thread_per_block_v<TAcc>) {
-          // increment the index along the elements processed by the current thread
-          ++index_;
-          if (index_ < range_)
-            return *this;
-        }
-
-        // increment the thread index with the block stride
-        first_ += stride_;
-        index_ = first_;
-        range_ = std::min(first_ + elements_, extent_);
-        if (index_ < extent_)
-          return *this;
-
-        // the iterator has reached or passed the end of the extent, clamp it to the extent
-        first_ = extent_;
-        index_ = extent_;
-        range_ = extent_;
-        return *this;
-      }
-
-      // post-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator operator++(int) {
-        const_iterator old = *this;
-        ++(*this);
-        return old;
-      }
-
-      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
-        return (index_ == other.index_) and (first_ == other.first_);
-      }
-
-      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
-
-    private:
-      // non-const to support iterator copy and assignment
-      Idx elements_;
-      Idx stride_;
-      Idx extent_;
-      // modified by the pre/post-increment operator
-      Idx first_;
-      Idx index_;
-      Idx range_;
-    };
-
-  private:
-    const Idx elements_;
-    const Idx thread_;
-    const Idx stride_;
-    const Idx extent_;
-  };
+  namespace detail {
+
+    /* IndependentGroupElementsAlong
+     *
+     * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
+     * `IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
+     */
+
+    template <typename TAcc,
+              std::size_t Dim,
+              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+    class IndependentGroupElementsAlong {
+    public:
+      ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)
+          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+            thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+            stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+            extent_{stride_} {}
+
+      ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)
+          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+            thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+            stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+            extent_{extent} {}
+
+      ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)
+          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+            thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
+            stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+            extent_{extent} {}
+
+      class const_iterator;
+      using iterator = const_iterator;
+
+      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }
+
+      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }
+
+      class const_iterator {
+        friend class IndependentGroupElementsAlong;
+
+        ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
+            : elements_{elements},
+              stride_{stride},
+              extent_{extent},
+              first_{std::min(first, extent)},
+              index_{first_},
+              range_{std::min(first + elements, extent)} {}
+
+      public:
+        ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
+
+        // pre-increment the iterator
+        ALPAKA_FN_ACC inline const_iterator& operator++() {
+          if constexpr (requires_single_thread_per_block_v<TAcc>) {
+            // increment the index along the elements processed by the current thread
+            ++index_;
+            if (index_ < range_)
+              return *this;
+          }
+
+          // increment the thread index with the block stride
+          first_ += stride_;
+          index_ = first_;
+          range_ = std::min(first_ + elements_, extent_);
+          if (index_ < extent_)
+            return *this;
+
+          // the iterator has reached or passed the end of the extent, clamp it to the extent
+          first_ = extent_;
+          index_ = extent_;
+          range_ = extent_;
+          return *this;
+        }
+
+        // post-increment the iterator
+        ALPAKA_FN_ACC inline const_iterator operator++(int) {
+          const_iterator old = *this;
+          ++(*this);
+          return old;
+        }
+
+        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
+          return (index_ == other.index_) and (first_ == other.first_);
+        }
+
+        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
+
+      private:
+        // non-const to support iterator copy and assignment
+        Idx elements_;
+        Idx stride_;
+        Idx extent_;
+        // modified by the pre/post-increment operator
+        Idx first_;
+        Idx index_;
+        Idx range_;
+      };
+
+    private:
+      const Idx elements_;
+      const Idx thread_;
+      const Idx stride_;
+      const Idx extent_;
+    };
+
+  }  // namespace detail
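Since the `independent_group_elements` wrapper below carries only a brief comment, a hedged sketch of how it nests inside `independent_groups` may help; the group and element counts are illustrative names only:

    for (auto group : cms::alpakatools::independent_groups(acc, nGroups)) {
      // the threads of this block (and their per-thread elements) stride
      // over the itemsPerGroup items belonging to this group
      for (auto idx : cms::alpakatools::independent_group_elements(acc, itemsPerGroup)) {
        buffer[group * itemsPerGroup + idx] = 0;
      }
    }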
   /* independent_group_elements
    */

@@ -1212,7 +1327,21 @@ namespace cms::alpakatools {
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) {
-    return independent_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+  }
+
+  /* independent_group_elements_along
+   *
+   * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
+   * `detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
+   */
+
+  template <std::size_t Dim,
+            typename TAcc,
+            typename... TArgs,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  ALPAKA_FN_ACC inline auto independent_group_elements_along(TAcc const& acc, TArgs... args) {
+    return detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }

   /* independent_group_elements_x, _y, _z
@@ -1225,21 +1354,21 @@ namespace cms::alpakatools {
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_x(TAcc const& acc, TArgs... args) {
-    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_y(TAcc const& acc, TArgs... args) {
-    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_z(TAcc const& acc, TArgs... args) {
-    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

   /* once_per_grid