Skip to content

Commit

Permalink
Merge pull request #44714 from fwyzard/alpaka_explicit_loop_names_140x
Browse files (browse the repository at this point in the history)
Reorganise alpaka kernel loop functions [14.0.x]
  • Loading branch information
cmsbuild authored Apr 22, 2024
2 parents 69f4504 + 4442472 commit c176f86
Show file tree
Hide file tree
Showing 8 changed files with 680 additions and 618 deletions.
1,226 changes: 644 additions & 582 deletions HeterogeneousCore/AlpakaInterface/interface/workdivision.h

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// make a strided loop over the kernel grid, covering up to "size" elements
for (int32_t i : elements_with_stride(acc, view.metadata().size())) {
for (int32_t i : uniform_elements(acc, view.metadata().size())) {
view[i] = {xvalue, 0., 0., i, flags, matrix * i};
}
}
Expand All @@ -52,7 +52,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// make a strided loop over the kernel grid, covering up to "size" elements
for (int32_t i : elements_with_stride(acc, view.metadata().size())) {
for (int32_t i : uniform_elements(acc, view.metadata().size())) {
view[i] = {xvalue, 0., 0., i, matrix * i};
}
}
Expand All @@ -74,7 +74,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// make a strided loop over the kernel grid, covering up to "size" elements
for (int32_t i : elements_with_stride(acc, view.metadata().size())) {
for (int32_t i : uniform_elements(acc, view.metadata().size())) {
view[i] = {xvalue, 0., 0., i, matrix * i};
}
}
Expand Down Expand Up @@ -174,7 +174,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// make a strided loop over the kernel grid, covering up to "size" elements
for (int32_t i : elements_with_stride(acc, output.metadata().size())) {
for (int32_t i : uniform_elements(acc, output.metadata().size())) {
double x = input[i].x();
if (i < esData.size()) {
x += esData.val(i) + esData.val2(i);
Expand All @@ -200,14 +200,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// make a strided loop over the kernel grid, covering up to "size" elements
for (int32_t i : elements_with_stride(acc, output.metadata().size())) {
for (int32_t i : uniform_elements(acc, output.metadata().size())) {
double x = input[i].x();
if (i < esData.size()) {
x += esData.val(i) + esData.val2(i);
}
output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
}
for (int32_t i : elements_with_stride(acc, output2.metadata().size())) {
for (int32_t i : uniform_elements(acc, output2.metadata().size())) {
double x2 = input2[i].x2();
if (i < esData.size()) {
x2 += esData.val(i) + esData.val2(i);
Expand Down Expand Up @@ -236,7 +236,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}

// make a strided loop over the kernel grid, covering up to "size" elements
for (int32_t i : elements_with_stride(acc, output.metadata().size())) {
for (int32_t i : uniform_elements(acc, output.metadata().size())) {
double x = input[i].x();
if (i < esData.size()) {
x += esData.val(i) + esData.val2(i);
Expand All @@ -245,14 +245,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
}
output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
}
for (int32_t i : elements_with_stride(acc, output2.metadata().size())) {
for (int32_t i : uniform_elements(acc, output2.metadata().size())) {
double x2 = input2[i].x2();
if (i < esData.size()) {
x2 += esData.val(i) + esData.val2(i);
}
output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
}
for (int32_t i : elements_with_stride(acc, output3.metadata().size())) {
for (int32_t i : uniform_elements(acc, output3.metadata().size())) {
double x3 = input3[i].x3();
if (i < esData.size()) {
x3 += esData.val(i) + esData.val2(i);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
auto* shr_hasSwitchToGain0_tmp = shr_isSaturated + elemsPerBlock;
auto* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + elemsPerBlock;

for (auto block : cms::alpakatools::blocks_with_stride(acc, totalElements)) {
for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
// set the output collection size scalars
if (idx.global == 0) {
uncalibRecHitsEB.size() = nchannelsEB;
Expand All @@ -91,7 +91,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {

alpaka::syncBlockThreads(acc);

for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const sample = idx.local % nsamples;

// non-divergent branch (except for the last 4 threads)
Expand All @@ -118,7 +118,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {

alpaka::syncBlockThreads(acc);

for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const sample = idx.local % nsamples;

if (sample < 2) {
Expand All @@ -141,7 +141,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {

alpaka::syncBlockThreads(acc);

for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const ch = idx.global / nsamples;
auto const sample = idx.local % nsamples;

Expand All @@ -164,7 +164,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
// check if we can remove it
alpaka::syncBlockThreads(acc);

for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const ch = idx.global / nsamples;
auto const sample = idx.local % nsamples;

Expand Down Expand Up @@ -355,7 +355,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
auto const elemsPerBlockY = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
Vec2D const size_2d = {elemsPerBlockY, blockDimX * elemsPerBlockX}; // {y, x} coordinates

for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size_2d)) {
for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size_2d)) {
auto const ch = ndindex[1] / nsamples;
auto const tx = ndindex[1] % nsamples;
auto const ty = ndindex[0];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
DataType* shrmem = alpaka::getDynSharedMem<DataType>(acc);

// channel
for (auto idx : cms::alpakatools::elements_with_stride(acc, nchannels)) {
for (auto idx : cms::alpakatools::uniform_elements(acc, nchannels)) {
if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
continue;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
auto* s_sumA = s_sum1 + elemsPerBlock;
auto* s_sumAA = s_sumA + elemsPerBlock;

for (auto txforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) {
for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
// go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
auto tx = nchannels * nsamples - 1 - txforward;
auto const ch = tx / nsamples;
Expand Down Expand Up @@ -163,8 +163,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
auto* shr_internalCondForSkipping1 = shr_condForUselessSamples + elemsPerBlock;
auto* shr_internalCondForSkipping2 = shr_internalCondForSkipping1 + elemsPerBlock;

for (auto block : cms::alpakatools::blocks_with_stride(acc, totalElements)) {
for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const ch = idx.global / nthreads_per_channel;
auto const ltx = idx.global % nthreads_per_channel;

Expand Down Expand Up @@ -396,7 +396,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
bool oddElements = nthreads_per_channel % 2;
CMS_UNROLL_LOOP
while (iter >= 1) {
for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const ltx = idx.global % nthreads_per_channel;

if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
Expand All @@ -411,7 +411,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
}

for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const ltx = idx.global % nthreads_per_channel;

// get precomputedflags for this element from shared memory
Expand Down Expand Up @@ -459,7 +459,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
oddElements = nthreads_per_channel % 2;
CMS_UNROLL_LOOP
while (iter >= 1) {
for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const ltx = idx.global % nthreads_per_channel;

if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
Expand All @@ -475,7 +475,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
}

for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) {
for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
auto const ltx = idx.global % nthreads_per_channel;

// load from shared memory the 0th guy (will contain accumulated values)
Expand Down Expand Up @@ -559,7 +559,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
auto* shr_sumAf = alpaka::getDynSharedMem<ScalarType>(acc);
auto* shr_sumff = shr_sumAf + elemsPerBlock;

for (auto gtxforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) {
for (auto gtxforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
// go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
auto gtx = nchannels * nsamples - 1 - gtxforward;
auto const ch = gtx / nsamples;
Expand Down Expand Up @@ -744,7 +744,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {

auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];

for (auto gtx : cms::alpakatools::elements_with_stride(acc, nchannelsEB * nsamples)) {
for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannelsEB * nsamples)) {
auto const elemIdx = gtx % elemsPerBlock;
auto const sample = elemIdx % nsamples;
auto const ch = gtx / nsamples;
Expand Down Expand Up @@ -800,7 +800,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
auto* shrSampleValues = alpaka::getDynSharedMem<ScalarType>(acc);
auto* shrSampleValueErrors = shrSampleValues + elemsPerBlock;

for (auto txforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) {
for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
// go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
auto tx = nchannels * nsamples - 1 - txforward;
auto const ch = tx / nsamples;
Expand Down Expand Up @@ -988,7 +988,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
auto const offsetForInputs = nchannelsEB;
auto const offsetForHashes = conditionsDev.offsetEE();

for (auto gtx : cms::alpakatools::elements_with_stride(acc, nchannels)) {
for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannels)) {
const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
auto const* dids = gtx >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
auto const* digis = gtx >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars) const {
const int nRH = pfRecHits.size();
for (int v : cms::alpakatools::elements_with_stride(acc, nRH)) {
for (int v : cms::alpakatools::uniform_elements(acc, nRH)) {
const int beg = pfClusteringEdgeVars[v].pfrh_edgeIdx();
const int end = pfClusteringEdgeVars[v + 1].pfrh_edgeIdx();
int m = v;
Expand All @@ -110,7 +110,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars) const {
const int nRH = pfRecHits.size();

for (int v : cms::alpakatools::elements_with_stride(acc, nRH)) {
for (int v : cms::alpakatools::uniform_elements(acc, nRH)) {
const int vstat = pfClusteringVars[v].pfrh_topoId();
if (v != vstat) {
const int beg = pfClusteringEdgeVars[v].pfrh_edgeIdx();
Expand Down Expand Up @@ -155,7 +155,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars) const {
const int nRH = pfRecHits.size();

for (int v : cms::alpakatools::elements_with_stride(acc, nRH)) {
for (int v : cms::alpakatools::uniform_elements(acc, nRH)) {
int next, vstat = pfClusteringVars[v].pfrh_topoId();
const int old = vstat;
while (vstat > (next = pfClusteringVars[vstat].pfrh_topoId())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1098,7 +1098,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
clusterView.size() = nRH;
}

for (auto i : elements_with_stride(acc, nRH)) {
for (auto i : uniform_elements(acc, nRH)) {
// Initialize arrays
pfClusteringVars[i].pfrh_isSeed() = 0;
pfClusteringVars[i].rhCount() = 0;
Expand Down Expand Up @@ -1176,7 +1176,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
pfClusteringVars.nEdges() = nRH * 8;
pfClusteringEdgeVars[nRH].pfrh_edgeIdx() = nRH * 8;
}
for (uint32_t i : cms::alpakatools::elements_with_stride(acc, nRH)) {
for (uint32_t i : cms::alpakatools::uniform_elements(acc, nRH)) {
pfClusteringEdgeVars[i].pfrh_edgeIdx() = i * 8;
pfClusteringVars[i].pfrh_topoId() = 0;
for (int j = 0; j < 8; j++) { // checking if neighbours exist and assigning neighbours as edges
Expand Down Expand Up @@ -1323,7 +1323,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
reco::PFRecHitFractionDeviceCollection::View fracView) const {
const int nRH = pfRecHits.size();

for (auto index : elements_with_stride_nd(acc, {nRH, nRH})) {
for (auto index : uniform_elements_nd(acc, {nRH, nRH})) {
const int i = index[0u]; // i is a seed index
const int j = index[1u]; // j is NOT a seed
int topoId = pfClusteringVars[i].pfrh_topoId();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
uint32_t* __restrict__ denseId2pfRecHit,
uint32_t* __restrict__ num_pfRecHits) const {
// Strided loop over CaloRecHits
for (int32_t i : cms::alpakatools::elements_with_stride(acc, recHits.metadata().size())) {
for (int32_t i : cms::alpakatools::uniform_elements(acc, recHits.metadata().size())) {
// Check energy thresholds/quality cuts (specialised for HCAL/ECAL)
if (!applyCuts(recHits[i], params, topology))
continue;
Expand Down Expand Up @@ -142,7 +142,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
pfRecHits.size() = *num_pfRecHits;

// Assign position information and associate neighbours
for (int32_t i : cms::alpakatools::elements_with_stride(acc, *num_pfRecHits)) {
for (int32_t i : cms::alpakatools::uniform_elements(acc, *num_pfRecHits)) {
const uint32_t denseId = CAL::detId2denseId(pfRecHits.detId(i));

pfRecHits.x(i) = topology.positionX(denseId);
Expand Down

0 comments on commit c176f86

Please sign in to comment.