
Commit

Merge pull request #172 from ghugo83/clusterizer
[alpaka] Port Clusterizer to Alpaka
makortel authored Apr 1, 2021
2 parents c2bcd58 + 08cfa6c commit 171f24d
Showing 7 changed files with 1,087 additions and 10 deletions.
31 changes: 29 additions & 2 deletions src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
@@ -232,7 +232,10 @@ namespace cms {
threadIdx < maxNumberOfElements;
threadIdx += blockDimension, endElementIdx += blockDimension) {
// (CPU) Loop on all elements.
- for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
+ if (endElementIdx > maxNumberOfElements) {
+   endElementIdx = maxNumberOfElements;
+ }
+ for (uint32_t i = threadIdx; i < endElementIdx; ++i) {
func(i);
}
}
@@ -274,7 +277,10 @@ namespace cms {
threadIdx < maxNumberOfElements;
threadIdx += gridDimension, endElementIdx += gridDimension) {
// (CPU) Loop on all elements.
- for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
+ if (endElementIdx > maxNumberOfElements) {
+   endElementIdx = maxNumberOfElements;
+ }
+ for (uint32_t i = threadIdx; i < endElementIdx; ++i) {
func(i);
}
}
@@ -291,6 +297,27 @@ namespace cms {
cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func);
}

/*
 * If the input index has reached the end of the thread's current element chunk,
 * strides it forward to the next chunk; otherwise leaves it unchanged.
 * Returns false once the strided index passes maxNumberOfElements.
 */
ALPAKA_FN_ACC bool get_next_element_1D_index_stride(uint32_t& i,
uint32_t& firstElementIdx,
uint32_t& endElementIdx,
const uint32_t stride,
const uint32_t maxNumberOfElements) {
bool isNextStrideElementValid = true;
if (i == endElementIdx) {
firstElementIdx += stride;
endElementIdx += stride;
i = firstElementIdx;
if (i >= maxNumberOfElements) {
isNextStrideElementValid = false;
}
}
return isNextStrideElementValid;
}

} // namespace alpakatools
} // namespace cms

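The helper above is consumed in a grid- or block-strided element loop, as the clusterizer kernel later in this commit does. A minimal sketch of the calling pattern, assuming the chunk bounds come from element_index_range_in_block and process(i) is a hypothetical stand-in for the per-element work:

// Illustrative calling pattern; process(i) is a placeholder, not part of the commit.
uint32_t firstElementIdx = firstElementIdxNoStride[0u];
uint32_t endElementIdx = endElementIdxNoStride[0u];
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
  // Once i exhausts the current chunk, stride both bounds forward and reset i;
  // stop as soon as the strided index passes numElements.
  if (!cms::alpakatools::get_next_element_1D_index_stride(
          i, firstElementIdx, endElementIdx, blockDimension, numElements))
    break;
  process(i);
}
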
174 changes: 174 additions & 0 deletions src/alpaka/Geometry/phase1PixelTopology.h
@@ -0,0 +1,174 @@
#ifndef Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h

#include <cstdint>
#include <array>

namespace phase1PixelTopology {

constexpr uint16_t numRowsInRoc = 80;
constexpr uint16_t numColsInRoc = 52;
constexpr uint16_t lastRowInRoc = numRowsInRoc - 1;
constexpr uint16_t lastColInRoc = numColsInRoc - 1;

constexpr uint16_t numRowsInModule = 2 * numRowsInRoc;
constexpr uint16_t numColsInModule = 8 * numColsInRoc;
constexpr uint16_t lastRowInModule = numRowsInModule - 1;
constexpr uint16_t lastColInModule = numColsInModule - 1;

constexpr int16_t xOffset = -81;
constexpr int16_t yOffset = -54 * 4;

constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule);

constexpr uint32_t numberOfModules = 1856;
constexpr uint32_t numberOfLayers = 10;
constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
96,
320,
672, // barrel
1184,
1296,
1408, // positive endcap
1520,
1632,
1744, // negative endcap
numberOfModules};
constexpr char const* layerName[numberOfLayers] = {
"BL1",
"BL2",
"BL3",
"BL4", // barrel
"E+1",
"E+2",
"E+3", // positive endcap
"E-1",
"E-2",
"E-3" // negative endcap
};

constexpr uint32_t numberOfModulesInBarrel = 1184;
constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / 8;

template <class Function, std::size_t... Indices>
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)> {
return {{f(Indices)...}};
}

template <int N, class Function>
constexpr auto map_to_array(Function f) -> std::array<typename std::result_of<Function(std::size_t)>::type, N> {
return map_to_array_helper(f, std::make_index_sequence<N>{});
}

constexpr uint32_t findMaxModuleStride() {
bool go = true;
int n = 2;
while (go) {
for (uint8_t i = 1; i < 11; ++i) {
if (layerStart[i] % n != 0) {
go = false;
break;
}
}
if (!go)
break;
n *= 2;
}
return n / 2;
}

constexpr uint32_t maxModuleStride = findMaxModuleStride();

constexpr uint8_t findLayer(uint32_t detId) {
for (uint8_t i = 0; i < 11; ++i)
if (detId < layerStart[i + 1])
return i;
return 11;
}

constexpr uint8_t findLayerFromCompact(uint32_t detId) {
detId *= maxModuleStride;
for (uint8_t i = 0; i < 11; ++i)
if (detId < layerStart[i + 1])
return i;
return 11;
}

constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride;
constexpr std::array<uint8_t, layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);

constexpr bool validateLayerIndex() {
bool res = true;
for (auto i = 0U; i < numberOfModules; ++i) {
auto j = i / maxModuleStride;
res &= (layer[j] < 10);
res &= (i >= layerStart[layer[j]]);
res &= (i < layerStart[layer[j] + 1]);
}
return res;
}

static_assert(validateLayerIndex(), "layer from detIndex algo is buggy");

// integer division by the ROC column count (52); valid for n < 512 (the upgrade needs 1024)
constexpr inline uint16_t divu52(uint16_t n) {
n = n >> 2;
uint16_t q = (n >> 1) + (n >> 4);
q = q + (q >> 4) + (q >> 5);
q = q >> 3;
uint16_t r = n - q * 13;
return q + ((r + 3) >> 4);
}

constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); }

constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); }

constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; }

constexpr inline uint16_t toRocY(uint16_t py) {
auto roc = divu52(py);
return py - 52 * roc;
}

constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); }

constexpr inline bool isBigPixY(uint16_t py) {
auto ly = toRocY(py);
return (ly == 0) | (ly == lastColInRoc);
}

constexpr inline uint16_t localX(uint16_t px) {
auto shift = 0;
if (px > lastRowInRoc)
shift += 1;
if (px > numRowsInRoc)
shift += 1;
return px + shift;
}

constexpr inline uint16_t localY(uint16_t py) {
auto roc = divu52(py);
auto shift = 2 * roc;
auto yInRoc = py - 52 * roc;
if (yInRoc > 0)
shift += 1;
return py + shift;
}

//FIXME move it elsewhere?
struct AverageGeometry {
static constexpr auto numberOfLaddersInBarrel = phase1PixelTopology::numberOfLaddersInBarrel;
float ladderZ[numberOfLaddersInBarrel];
float ladderX[numberOfLaddersInBarrel];
float ladderY[numberOfLaddersInBarrel];
float ladderR[numberOfLaddersInBarrel];
float ladderMinZ[numberOfLaddersInBarrel];
float ladderMaxZ[numberOfLaddersInBarrel];
float endCapZ[2]; // just for pos and neg Layer1
};

} // namespace phase1PixelTopology

#endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
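The bit-trick helpers above are easy to validate offline. A standalone sanity check, illustrative only and not part of the commit (it assumes this header is reachable as Geometry/phase1PixelTopology.h on the include path), exercising divu52 over its documented domain and spot-checking the big-pixel gaps inserted by localX/localY; the layer-table facts are checked at compile time, in the spirit of the static_assert above:

// Standalone sanity checks (illustrative, not part of the commit).
#include <cassert>
#include <cstdint>
#include "Geometry/phase1PixelTopology.h"

// Every layerStart boundary is divisible by 16, but 1296 is not divisible by 32,
// so the compressed module-to-layer table holds one entry per 16 modules.
static_assert(phase1PixelTopology::maxModuleStride == 16,
              "largest power of two dividing every layerStart");
static_assert(phase1PixelTopology::layerIndexSize == 1856 / 16,
              "one table entry per stride of modules");

int main() {
  using namespace phase1PixelTopology;
  // divu52 must agree with integer division over the documented domain n < 512.
  for (uint16_t n = 0; n < 512; ++n)
    assert(divu52(n) == n / 52);
  // localX skips two positions across the double-width rows at the ROC boundary...
  assert(localX(79) == 79 && localX(80) == 81 && localX(81) == 83);
  // ...and localY skips two positions between adjacent 52-column ROCs.
  assert(localY(51) == 52 && localY(52) == 54);
  return 0;
}
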
Empty file.
147 changes: 147 additions & 0 deletions src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h
@@ -0,0 +1,147 @@
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h

#include <cstdint>
#include <cstdio>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"
#include "AlpakaCore/prefixScan.h"
#include "AlpakaDataFormats/gpuClusteringConstants.h"

namespace gpuClustering {

struct clusterChargeCut {
template <typename T_Acc>
ALPAKA_FN_ACC void operator()(
const T_Acc& acc,
uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster)
uint16_t const* __restrict__ adc, // charge of each pixel
uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module
uint32_t* __restrict__ nClustersInModule, // modified: number of clusters found in each module
uint32_t const* __restrict__ moduleId, // module id of each module
int32_t* __restrict__ clusterId, // modified: cluster id of each pixel
const uint32_t numElements) const {
const uint32_t blockIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
if (blockIdx >= moduleStart[0])
return;

auto firstPixel = moduleStart[1 + blockIdx];
auto thisModuleId = id[firstPixel];
assert(thisModuleId < MaxNumModules);
assert(thisModuleId == moduleId[blockIdx]);

auto nclus = nClustersInModule[thisModuleId];
if (nclus == 0)
return;

const uint32_t threadIdxLocal(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
if (threadIdxLocal == 0 && nclus > MaxNumClustersPerModules)
printf("Warning too many clusters in module %d in block %d: %d > %d\n",
thisModuleId,
blockIdx,
nclus,
MaxNumClustersPerModules);

// Stride = block size.
const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);

// Get thread / CPU element indices in block.
const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_index_range_in_block(acc, Vec1::all(firstPixel));

if (nclus > MaxNumClustersPerModules) {
uint32_t firstElementIdx = firstElementIdxNoStride[0u];
uint32_t endElementIdx = endElementIdxNoStride[0u];
// remove the excess clusters; FIXME: find a way to cut on charge first
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
if (clusterId[i] >= MaxNumClustersPerModules) {
id[i] = InvId;
clusterId[i] = InvId;
}
}
nclus = MaxNumClustersPerModules;
}

#ifdef GPU_DEBUG
if (thisModuleId % 100 == 1)
if (threadIdxLocal == 0)
printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx);
#endif

auto&& charge = alpaka::block::shared::st::allocVar<int32_t[MaxNumClustersPerModules], __COUNTER__>(acc);
auto&& ok = alpaka::block::shared::st::allocVar<uint8_t[MaxNumClustersPerModules], __COUNTER__>(acc);
auto&& newclusId = alpaka::block::shared::st::allocVar<uint16_t[MaxNumClustersPerModules], __COUNTER__>(acc);

assert(nclus <= MaxNumClustersPerModules);
cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { charge[i] = 0; });
alpaka::block::sync::syncBlockThreads(acc);

uint32_t firstElementIdx = firstElementIdxNoStride[0u];
uint32_t endElementIdx = endElementIdxNoStride[0u];
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, &charge[clusterId[i]], static_cast<int32_t>(adc[i]));
}
alpaka::block::sync::syncBlockThreads(acc);

auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?)
cms::alpakatools::for_each_element_1D_block_stride(
acc, nclus, [&](uint32_t i) { newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; });
alpaka::block::sync::syncBlockThreads(acc);

// renumber
auto&& ws = alpaka::block::shared::st::allocVar<uint16_t[32], __COUNTER__>(acc);
cms::alpakatools::blockPrefixScan(acc, newclusId, nclus, ws);

assert(nclus >= newclusId[nclus - 1]);

if (nclus == newclusId[nclus - 1])
return;

nClustersInModule[thisModuleId] = newclusId[nclus - 1];
alpaka::block::sync::syncBlockThreads(acc);

// mark bad cluster again
cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) {
if (0 == ok[i])
newclusId[i] = InvId + 1;
});
alpaka::block::sync::syncBlockThreads(acc);

// reassign id
firstElementIdx = firstElementIdxNoStride[0u];
endElementIdx = endElementIdxNoStride[0u];
for (uint32_t i = firstElementIdx; i < numElements; ++i) {
if (!cms::alpakatools::get_next_element_1D_index_stride(
i, firstElementIdx, endElementIdx, blockDimension, numElements))
break;
if (id[i] == InvId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
clusterId[i] = newclusId[clusterId[i]] - 1;
if (clusterId[i] == InvId)
id[i] = InvId;
}

//done
}
};

} // namespace gpuClustering

#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
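The kernel's per-module logic is compact enough to mirror on the host. A single-module reference, illustrative only and not part of the commit (it uses -1 instead of InvId to flag removed pixels and assumes the pixel arrays hold exactly one module), showing the same accumulate / cut / prefix-scan-renumber / reassign sequence:

// Host-side reference of the charge cut (hypothetical helper, e.g. for testing).
#include <cstddef>
#include <cstdint>
#include <vector>

inline uint32_t clusterChargeCutReference(std::vector<int32_t>& clusterId,   // in/out: cluster id per pixel
                                          std::vector<uint16_t> const& adc,  // charge per pixel
                                          uint32_t nclus,                    // clusters in this module
                                          int32_t chargeCut) {               // 2000 or 4000, as in the kernel
  // Accumulate the total charge of each cluster (the kernel does this atomically).
  std::vector<int32_t> charge(nclus, 0);
  for (std::size_t i = 0; i < adc.size(); ++i)
    charge[clusterId[i]] += adc[i];
  // An inclusive prefix scan over the keep flags assigns each survivor its new id,
  // mirroring the blockPrefixScan over newclusId.
  std::vector<int32_t> newclusId(nclus);
  int32_t nGood = 0;
  for (uint32_t c = 0; c < nclus; ++c) {
    if (charge[c] > chargeCut)
      ++nGood;
    newclusId[c] = nGood;
  }
  // Reassign: survivors get their compact id; pixels of cut clusters get -1.
  for (auto& cid : clusterId)
    cid = (charge[cid] > chargeCut) ? newclusId[cid] - 1 : -1;
  return static_cast<uint32_t>(nGood);  // new nClustersInModule value
}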
