[alpaka] Port Clusterizer to Alpaka #172

Merged
merged 24 commits into from
Apr 1, 2021
Changes from all commits

Commits (24)
6c3315a
[alpaka] Add clusterizer. Compiles, but still issues to fix at run time.
ghugo83 Feb 11, 2021
8e829ef
Define GPU_DEBUG true
ghugo83 Feb 11, 2021
10c30fd
Merge branch 'workdiv_helper' into clusterizer
ghugo83 Feb 15, 2021
9a5bf29
Change clusterizer code to use new helper functions. There were issue…
ghugo83 Feb 15, 2021
7fa84a1
Cannot use break and continue in lambda, hence replace by equivalent …
ghugo83 Feb 16, 2021
4157e45
Fix issues seen at run time in the CUDA case, spotted during debugging.
ghugo83 Feb 17, 2021
baa89ea
Fix break behavior, which surprisingly does not seem to have an effect on …
ghugo83 Feb 17, 2021
fc7820f
Fixes run time issues in serial and TBB cases.
ghugo83 Feb 17, 2021
2bf4318
Merge remote-tracking branch 'origin/master' into clusterizer_none
ghugo83 Feb 17, 2021
5e90ffd
Merge branch 'clusterizer_none' into clusterizer
ghugo83 Feb 17, 2021
7f82936
remove debug printouts
ghugo83 Feb 17, 2021
56aa5f4
Add handling of a constexpr threadDimension. Issue with TBB at run time?
ghugo83 Feb 17, 2021
58ba6ad
Clean up intricate conditional statements.
ghugo83 Feb 17, 2021
8baaa52
Remove GPU_DEBUG true
ghugo83 Feb 17, 2021
e92a8ed
Forgot to remove commented debug line
ghugo83 Feb 17, 2021
9513add
Cannot break out of a lambda. Need the presence of a for loop.…
ghugo83 Feb 19, 2021
8dfc6d5
clang-format
ghugo83 Feb 19, 2021
6bab054
threadDimension set to 1 for CPU case, to match what is done in legac…
ghugo83 Feb 19, 2021
1dc7e39
Set iter to same value as for legacy. Not nice to have size of array …
ghugo83 Feb 19, 2021
f64f181
Place std::min outside of inner loop
ghugo83 Mar 2, 2021
6b16234
Rename gpuClustering_t.cc into clustering.cc
ghugo83 Mar 2, 2021
7e52389
Merge remote-tracking branch 'origin/master' into clusterizer_pr
ghugo83 Mar 30, 2021
153235a
small corrections
ghugo83 Mar 31, 2021
08cfa6c
small corrections
ghugo83 Apr 1, 2021
31 changes: 29 additions & 2 deletions src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
@@ -232,7 +232,10 @@ namespace cms {
threadIdx < maxNumberOfElements;
threadIdx += blockDimension, endElementIdx += blockDimension) {
// (CPU) Loop on all elements.
- for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
+ if (endElementIdx > maxNumberOfElements) {
+ endElementIdx = maxNumberOfElements;
+ }
Collaborator:
I would have kept endElementIdx = std::min(endElementIdx, maxNumberOfElements), but it probably doesn't make much difference.

+ for (uint32_t i = threadIdx; i < endElementIdx; ++i) {
func(i);
}
}
@@ -274,7 +277,10 @@ namespace cms {
threadIdx < maxNumberOfElements;
threadIdx += gridDimension, endElementIdx += gridDimension) {
// (CPU) Loop on all elements.
- for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
+ if (endElementIdx > maxNumberOfElements) {
+ endElementIdx = maxNumberOfElements;
+ }
+ for (uint32_t i = threadIdx; i < endElementIdx; ++i) {
func(i);
}
}
@@ -291,6 +297,27 @@
cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func);
}

/*
* Case where the input index has reached the end of threadDimension: strides the input index.
* Otherwise: do nothing.
*/
ALPAKA_FN_ACC bool get_next_element_1D_index_stride(uint32_t& i,
uint32_t& firstElementIdx,
uint32_t& endElementIdx,
const uint32_t stride,
const uint32_t maxNumberOfElements) {
bool isNextStrideElementValid = true;
if (i == endElementIdx) {
firstElementIdx += stride;
endElementIdx += stride;
i = firstElementIdx;
if (i >= maxNumberOfElements) {
isNextStrideElementValid = false;
}
}
return isNextStrideElementValid;
}

} // namespace alpakatools
} // namespace cms

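As a reference for how the new helper is meant to be used, here is a minimal host-side sketch of the call pattern (plain C++, no Alpaka dependencies; everything except the helper's logic — the stand-in function and the processStrided wrapper — is made up for illustration). It shows the single flat loop in which break and continue keep their legacy meaning:

#include <cstdint>

// Host-side stand-in with the same logic as
// cms::alpakatools::get_next_element_1D_index_stride (illustrative only).
bool get_next_element_index(uint32_t& i,
                            uint32_t& firstElementIdx,
                            uint32_t& endElementIdx,
                            const uint32_t stride,
                            const uint32_t maxNumberOfElements) {
  if (i == endElementIdx) {        // current chunk of per-thread elements exhausted
    firstElementIdx += stride;     // jump to the next strided chunk
    endElementIdx += stride;
    i = firstElementIdx;
    if (i >= maxNumberOfElements)
      return false;                // walked past the end: caller should stop
  }
  return true;
}

void processStrided(uint32_t firstElementIdx,
                    uint32_t endElementIdx,
                    const uint32_t stride,
                    const uint32_t numElements) {
  // One flat loop instead of "outer stride loop + inner element loop":
  // a break here really ends the whole strided walk, as in the legacy kernels.
  for (uint32_t i = firstElementIdx; i < numElements; ++i) {
    if (!get_next_element_index(i, firstElementIdx, endElementIdx, stride, numElements))
      break;
    // ... per-element work; may use break / continue exactly as before ...
  }
}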
174 changes: 174 additions & 0 deletions src/alpaka/Geometry/phase1PixelTopology.h
@@ -0,0 +1,174 @@
#ifndef Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

constexpr uint16_t numRowsInRoc = 80;
constexpr uint16_t numColsInRoc = 52;
constexpr uint16_t lastRowInRoc = numRowsInRoc - 1;
constexpr uint16_t lastColInRoc = numColsInRoc - 1;

constexpr uint16_t numRowsInModule = 2 * numRowsInRoc;
constexpr uint16_t numColsInModule = 8 * numColsInRoc;
constexpr uint16_t lastRowInModule = numRowsInModule - 1;
constexpr uint16_t lastColInModule = numColsInModule - 1;

constexpr int16_t xOffset = -81;
constexpr int16_t yOffset = -54 * 4;

constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule);

constexpr uint32_t numberOfModules = 1856;
constexpr uint32_t numberOfLayers = 10;
constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
96,
320,
672, // barrel
1184,
1296,
1408, // positive endcap
1520,
1632,
1744, // negative endcap
numberOfModules};
constexpr char const* layerName[numberOfLayers] = {
"BL1",
"BL2",
"BL3",
"BL4", // barrel
"E+1",
"E+2",
"E+3", // positive endcap
"E-1",
"E-2",
"E-3" // negative endcap
};

constexpr uint32_t numberOfModulesInBarrel = 1184;
constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / 8;

template <class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)> {
return {{f(Indices)...}};
}

template <int N, class Function>
constexpr auto map_to_array(Function f) -> std::array<typename std::result_of<Function(std::size_t)>::type, N> {
return map_to_array_helper(f, std::make_index_sequence<N>{});
}

constexpr uint32_t findMaxModuleStride() {
bool go = true;
int n = 2;
while (go) {
for (uint8_t i = 1; i < 11; ++i) {
if (layerStart[i] % n != 0) {
go = false;
break;
}
}
if (!go)
break;
n *= 2;
}
return n / 2;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();

constexpr uint8_t findLayer(uint32_t detId) {
for (uint8_t i = 0; i < 11; ++i)
if (detId < layerStart[i + 1])
return i;
return 11;
}

constexpr uint8_t findLayerFromCompact(uint32_t detId) {
detId *= maxModuleStride;
for (uint8_t i = 0; i < 11; ++i)
if (detId < layerStart[i + 1])
return i;
return 11;
}

constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride;
constexpr std::array<uint8_t, layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

constexpr bool validateLayerIndex() {
bool res = true;
for (auto i = 0U; i < numberOfModules; ++i) {
auto j = i / maxModuleStride;
res &= (layer[j] < 10);
res &= (i >= layerStart[layer[j]]);
res &= (i < layerStart[layer[j] + 1]);
}
return res;
}

static_assert(validateLayerIndex(), "layer from detIndex algo is buggy");

// this is for the ROC n<512 (upgrade 1024)
constexpr inline uint16_t divu52(uint16_t n) {
n = n >> 2;
uint16_t q = (n >> 1) + (n >> 4);
q = q + (q >> 4) + (q >> 5);
q = q >> 3;
uint16_t r = n - q * 13;
return q + ((r + 3) >> 4);
}

constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); }

constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); }

constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; }

constexpr inline uint16_t toRocY(uint16_t py) {
auto roc = divu52(py);
return py - 52 * roc;
}

constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); }

constexpr inline bool isBigPixY(uint16_t py) {
auto ly = toRocY(py);
return (ly == 0) | (ly == lastColInRoc);
}

constexpr inline uint16_t localX(uint16_t px) {
auto shift = 0;
if (px > lastRowInRoc)
shift += 1;
if (px > numRowsInRoc)
shift += 1;
return px + shift;
}

constexpr inline uint16_t localY(uint16_t py) {
auto roc = divu52(py);
auto shift = 2 * roc;
auto yInRoc = py - 52 * roc;
if (yInRoc > 0)
shift += 1;
return py + shift;
}

//FIXME move it elsewhere?
struct AverageGeometry {
static constexpr auto numberOfLaddersInBarrel = phase1PixelTopology::numberOfLaddersInBarrel;
float ladderZ[numberOfLaddersInBarrel];
float ladderX[numberOfLaddersInBarrel];
float ladderY[numberOfLaddersInBarrel];
float ladderR[numberOfLaddersInBarrel];
float ladderMinZ[numberOfLaddersInBarrel];
float ladderMaxZ[numberOfLaddersInBarrel];
float endCapZ[2]; // just for pos and neg Layer1
};

} // namespace phase1PixelTopology

#endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
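A small compile-time check, not part of the PR, makes the divu52 bit trick easier to trust: it compares the helper against plain integer division over the column range the clusterizer actually uses (all values below 512, where the comment in the header states the trick is valid). The include path is assumed from this repository's layout:

#include "Geometry/phase1PixelTopology.h"  // assumed include path

#include <cstdint>

// Illustrative check that divu52(n) == n / 52 for every local-y value in a module.
constexpr bool validateDivu52() {
  for (uint16_t py = 0; py < phase1PixelTopology::numColsInModule; ++py) {
    if (phase1PixelTopology::divu52(py) != py / 52)
      return false;
  }
  return true;
}
static_assert(validateDivu52(), "divu52 must agree with n / 52 for n < 512");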
Empty file.
147 changes: 147 additions & 0 deletions src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
@@ -0,0 +1,147 @@
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h

#include <cstdint>
#include <cstdio>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"
#include "AlpakaCore/prefixScan.h"
#include "AlpakaDataFormats/gpuClusteringConstants.h"

namespace gpuClustering {

struct clusterChargeCut {
template <typename T_Acc>
ALPAKA_FN_ACC void operator()(
const T_Acc& acc,
uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster)
uint16_t const* __restrict__ adc, // charge of each pixel
uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module
uint32_t* __restrict__ nClustersInModule, // modified: number of clusters found in each module
uint32_t const* __restrict__ moduleId, // module id of each module
int32_t* __restrict__ clusterId, // modified: cluster id of each pixel
const uint32_t numElements) const {
const uint32_t blockIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
if (blockIdx >= moduleStart[0])
return;

auto firstPixel = moduleStart[1 + blockIdx];
auto thisModuleId = id[firstPixel];
assert(thisModuleId < MaxNumModules);
assert(thisModuleId == moduleId[blockIdx]);

auto nclus = nClustersInModule[thisModuleId];
if (nclus == 0)
return;

const uint32_t threadIdxLocal(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
if (threadIdxLocal == 0 && nclus > MaxNumClustersPerModules)
printf("Warning too many clusters in module %d in block %d: %d > %d\n",
thisModuleId,
blockIdx,
nclus,
MaxNumClustersPerModules);

// Stride = block size.
const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);

// Get thread / CPU element indices in block.
const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_index_range_in_block(acc, Vec1::all(firstPixel));

if (nclus > MaxNumClustersPerModules) {
uint32_t firstElementIdx = firstElementIdxNoStride[0u];
uint32_t endElementIdx = endElementIdxNoStride[0u];
// remove excess FIXME find a way to cut charge first....
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
if (clusterId[i] >= MaxNumClustersPerModules) {
id[i] = InvId;
clusterId[i] = InvId;
}
}
nclus = MaxNumClustersPerModules;
}

#ifdef GPU_DEBUG
if (thisModuleId % 100 == 1)
if (threadIdxLocal == 0)
printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx);
#endif

auto&& charge = alpaka::block::shared::st::allocVar<int32_t[MaxNumClustersPerModules], __COUNTER__>(acc);
auto&& ok = alpaka::block::shared::st::allocVar<uint8_t[MaxNumClustersPerModules], __COUNTER__>(acc);
auto&& newclusId = alpaka::block::shared::st::allocVar<uint16_t[MaxNumClustersPerModules], __COUNTER__>(acc);

assert(nclus <= MaxNumClustersPerModules);
cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { charge[i] = 0; });
alpaka::block::sync::syncBlockThreads(acc);

uint32_t firstElementIdx = firstElementIdxNoStride[0u];
uint32_t endElementIdx = endElementIdxNoStride[0u];
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
Collaborator:
Is the main reason for this function (instead of for_each_element_...) the fact that the loop body does break?

Contributor Author (ghugo83, Mar 31, 2021):
Yes. The issue here is that two loops are introduced with Alpaka: one for the strided access, and an inner one over the CPU elements. Whether one uses a helper function and a lambda, or does the looping directly at the call site, the two-loop situation remains.
This was a problem wherever the legacy code had break statements, since only the innermost loop was exited. The trivial workaround I used at first was a boolean flag to exit the innermost loop when needed.
But that boolean gymnastics had to be repeated at every such call site, hence code duplication.

This helper function gives the same logic as the two loops, but with a single loop and no extra storage, which makes porting the legacy code easier: nothing has to change about the break statements at the call site, and they can be kept 1:1.

i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, &charge[clusterId[i]], static_cast<int32_t>(adc[i]));
}
alpaka::block::sync::syncBlockThreads(acc);

auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?)
cms::alpakatools::for_each_element_1D_block_stride(
acc, nclus, [&](uint32_t i) { newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; });
alpaka::block::sync::syncBlockThreads(acc);

// renumber
auto&& ws = alpaka::block::shared::st::allocVar<uint16_t[32], __COUNTER__>(acc);
Collaborator:
The 32 here is related to the warp size, right? We should probably look later on how to make that dependent on acc.

Contributor Author (ghugo83):
Yes totally.

cms::alpakatools::blockPrefixScan(acc, newclusId, nclus, ws);

assert(nclus >= newclusId[nclus - 1]);

if (nclus == newclusId[nclus - 1])
return;

nClustersInModule[thisModuleId] = newclusId[nclus - 1];
alpaka::block::sync::syncBlockThreads(acc);

// mark bad cluster again
cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) {
if (0 == ok[i])
newclusId[i] = InvId + 1;
});
alpaka::block::sync::syncBlockThreads(acc);

// reassign id
firstElementIdx = firstElementIdxNoStride[0u];
endElementIdx = endElementIdxNoStride[0u];
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
clusterId[i] = newclusId[clusterId[i]] - 1;
if (clusterId[i] == InvId)
id[i] = InvId;
}

//done
}
};

} // namespace gpuClustering

#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
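To summarise what the kernel above does per module, here is a serial, single-module sketch of the same charge-cut and renumbering logic (plain C++, illustrative only: the container types are made up, InvId is assumed to be the 9999 sentinel from gpuClusteringConstants, and a simple loop stands in for cms::alpakatools::blockPrefixScan; the excess-cluster removal step is omitted):

#include <cstdint>
#include <cstddef>
#include <vector>

constexpr uint16_t InvId = 9999;  // assumption: same sentinel as in gpuClusteringConstants.h

// Serial illustration of the per-module charge cut (not the PR code).
void clusterChargeCutSerial(std::vector<uint16_t>& id,         // module id of each pixel
                            const std::vector<uint16_t>& adc,  // charge of each pixel
                            std::vector<int32_t>& clusterId,   // cluster id of each pixel
                            uint32_t nclus,                    // clusters found in this module
                            uint16_t thisModuleId) {
  // 1) accumulate the total charge of each cluster
  std::vector<int32_t> charge(nclus, 0);
  for (std::size_t i = 0; i < id.size(); ++i) {
    if (id[i] == InvId || id[i] != thisModuleId)
      continue;
    charge[clusterId[i]] += adc[i];
  }

  // 2) keep only clusters above the cut (same thresholds as the kernel)
  const int32_t chargeCut = thisModuleId < 96 ? 2000 : 4000;
  std::vector<uint8_t> ok(nclus);
  std::vector<uint32_t> newclusId(nclus);
  for (uint32_t c = 0; c < nclus; ++c)
    newclusId[c] = ok[c] = charge[c] > chargeCut ? 1 : 0;

  // 3) inclusive prefix scan: newclusId[c] = number of good clusters with index <= c
  for (uint32_t c = 1; c < nclus; ++c)
    newclusId[c] += newclusId[c - 1];

  // 4) bad clusters get a sentinel so that their pixels end up invalidated below
  for (uint32_t c = 0; c < nclus; ++c)
    if (!ok[c])
      newclusId[c] = InvId + 1;

  // 5) renumber surviving clusters and invalidate the pixels of removed ones
  for (std::size_t i = 0; i < id.size(); ++i) {
    if (id[i] == InvId || id[i] != thisModuleId)
      continue;
    clusterId[i] = static_cast<int32_t>(newclusId[clusterId[i]]) - 1;
    if (clusterId[i] == InvId)
      id[i] = InvId;
  }
}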