-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[alpaka] Port Clusterizer to Alpaka #172
Changes from all commits
6c3315a
8e829ef
10c30fd
9a5bf29
7fa84a1
4157e45
baa89ea
fc7820f
2bf4318
5e90ffd
7f82936
56aa5f4
58ba6ad
8baaa52
e92a8ed
9513add
8dfc6d5
6bab054
1dc7e39
f64f181
6b16234
7e52389
153235a
08cfa6c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
#ifndef Geometry_TrackerGeometryBuilder_phase1PixelTopology_h | ||
#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h | ||
|
||
#include <cstdint> | ||
#include <array> | ||
|
||
namespace phase1PixelTopology { | ||
|
||
constexpr uint16_t numRowsInRoc = 80; | ||
constexpr uint16_t numColsInRoc = 52; | ||
constexpr uint16_t lastRowInRoc = numRowsInRoc - 1; | ||
constexpr uint16_t lastColInRoc = numColsInRoc - 1; | ||
|
||
constexpr uint16_t numRowsInModule = 2 * numRowsInRoc; | ||
constexpr uint16_t numColsInModule = 8 * numColsInRoc; | ||
constexpr uint16_t lastRowInModule = numRowsInModule - 1; | ||
constexpr uint16_t lastColInModule = numColsInModule - 1; | ||
|
||
constexpr int16_t xOffset = -81; | ||
constexpr int16_t yOffset = -54 * 4; | ||
|
||
constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule); | ||
|
||
constexpr uint32_t numberOfModules = 1856; | ||
constexpr uint32_t numberOfLayers = 10; | ||
constexpr uint32_t layerStart[numberOfLayers + 1] = {0, | ||
96, | ||
320, | ||
672, // barrel | ||
1184, | ||
1296, | ||
1408, // positive endcap | ||
1520, | ||
1632, | ||
1744, // negative endcap | ||
numberOfModules}; | ||
constexpr char const* layerName[numberOfLayers] = { | ||
"BL1", | ||
"BL2", | ||
"BL3", | ||
"BL4", // barrel | ||
"E+1", | ||
"E+2", | ||
"E+3", // positive endcap | ||
"E-1", | ||
"E-2", | ||
"E-3" // negative endcap | ||
}; | ||
|
||
constexpr uint32_t numberOfModulesInBarrel = 1184; | ||
constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / 8; | ||
|
||
template <class Function, std::size_t... Indices> | ||
constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>) | ||
-> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)> { | ||
return {{f(Indices)...}}; | ||
} | ||
|
||
template <int N, class Function> | ||
constexpr auto map_to_array(Function f) -> std::array<typename std::result_of<Function(std::size_t)>::type, N> { | ||
return map_to_array_helper(f, std::make_index_sequence<N>{}); | ||
} | ||
|
||
constexpr uint32_t findMaxModuleStride() { | ||
bool go = true; | ||
int n = 2; | ||
while (go) { | ||
for (uint8_t i = 1; i < 11; ++i) { | ||
if (layerStart[i] % n != 0) { | ||
go = false; | ||
break; | ||
} | ||
} | ||
if (!go) | ||
break; | ||
n *= 2; | ||
} | ||
return n / 2; | ||
} | ||
|
||
constexpr uint32_t maxModuleStride = findMaxModuleStride(); | ||
|
||
constexpr uint8_t findLayer(uint32_t detId) { | ||
for (uint8_t i = 0; i < 11; ++i) | ||
if (detId < layerStart[i + 1]) | ||
return i; | ||
return 11; | ||
} | ||
|
||
constexpr uint8_t findLayerFromCompact(uint32_t detId) { | ||
detId *= maxModuleStride; | ||
for (uint8_t i = 0; i < 11; ++i) | ||
if (detId < layerStart[i + 1]) | ||
return i; | ||
return 11; | ||
} | ||
|
||
constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; | ||
constexpr std::array<uint8_t, layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact); | ||
|
||
constexpr bool validateLayerIndex() { | ||
bool res = true; | ||
for (auto i = 0U; i < numberOfModules; ++i) { | ||
auto j = i / maxModuleStride; | ||
res &= (layer[j] < 10); | ||
res &= (i >= layerStart[layer[j]]); | ||
res &= (i < layerStart[layer[j] + 1]); | ||
} | ||
return res; | ||
} | ||
|
||
static_assert(validateLayerIndex(), "layer from detIndex algo is buggy"); | ||
|
||
// this is for the ROC n<512 (upgrade 1024) | ||
constexpr inline uint16_t divu52(uint16_t n) { | ||
n = n >> 2; | ||
uint16_t q = (n >> 1) + (n >> 4); | ||
q = q + (q >> 4) + (q >> 5); | ||
q = q >> 3; | ||
uint16_t r = n - q * 13; | ||
return q + ((r + 3) >> 4); | ||
} | ||
|
||
constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); } | ||
|
||
constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); } | ||
|
||
constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; } | ||
|
||
constexpr inline uint16_t toRocY(uint16_t py) { | ||
auto roc = divu52(py); | ||
return py - 52 * roc; | ||
} | ||
|
||
constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); } | ||
|
||
constexpr inline bool isBigPixY(uint16_t py) { | ||
auto ly = toRocY(py); | ||
return (ly == 0) | (ly == lastColInRoc); | ||
} | ||
|
||
constexpr inline uint16_t localX(uint16_t px) { | ||
auto shift = 0; | ||
if (px > lastRowInRoc) | ||
shift += 1; | ||
if (px > numRowsInRoc) | ||
shift += 1; | ||
return px + shift; | ||
} | ||
|
||
constexpr inline uint16_t localY(uint16_t py) { | ||
auto roc = divu52(py); | ||
auto shift = 2 * roc; | ||
auto yInRoc = py - 52 * roc; | ||
if (yInRoc > 0) | ||
shift += 1; | ||
return py + shift; | ||
} | ||
|
||
//FIXME move it elsewhere? | ||
struct AverageGeometry { | ||
static constexpr auto numberOfLaddersInBarrel = phase1PixelTopology::numberOfLaddersInBarrel; | ||
float ladderZ[numberOfLaddersInBarrel]; | ||
float ladderX[numberOfLaddersInBarrel]; | ||
float ladderY[numberOfLaddersInBarrel]; | ||
float ladderR[numberOfLaddersInBarrel]; | ||
float ladderMinZ[numberOfLaddersInBarrel]; | ||
float ladderMaxZ[numberOfLaddersInBarrel]; | ||
float endCapZ[2]; // just for pos and neg Layer1 | ||
}; | ||
|
||
} // namespace phase1PixelTopology | ||
|
||
#endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h | ||
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h | ||
|
||
#include <cstdint> | ||
#include <cstdio> | ||
|
||
#include "AlpakaCore/alpakaConfig.h" | ||
#include "AlpakaCore/alpakaWorkDivHelper.h" | ||
#include "AlpakaCore/prefixScan.h" | ||
#include "AlpakaDataFormats/gpuClusteringConstants.h" | ||
|
||
namespace gpuClustering { | ||
|
||
struct clusterChargeCut { | ||
template <typename T_Acc> | ||
ALPAKA_FN_ACC void operator()( | ||
const T_Acc& acc, | ||
uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster) | ||
uint16_t const* __restrict__ adc, // charge of each pixel | ||
uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module | ||
uint32_t* __restrict__ nClustersInModule, // modified: number of clusters found in each module | ||
uint32_t const* __restrict__ moduleId, // module id of each module | ||
int32_t* __restrict__ clusterId, // modified: cluster id of each pixel | ||
const uint32_t numElements) const { | ||
const uint32_t blockIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]); | ||
if (blockIdx >= moduleStart[0]) | ||
return; | ||
|
||
auto firstPixel = moduleStart[1 + blockIdx]; | ||
auto thisModuleId = id[firstPixel]; | ||
assert(thisModuleId < MaxNumModules); | ||
assert(thisModuleId == moduleId[blockIdx]); | ||
|
||
auto nclus = nClustersInModule[thisModuleId]; | ||
if (nclus == 0) | ||
return; | ||
|
||
const uint32_t threadIdxLocal(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]); | ||
if (threadIdxLocal == 0 && nclus > MaxNumClustersPerModules) | ||
printf("Warning too many clusters in module %d in block %d: %d > %d\n", | ||
thisModuleId, | ||
blockIdx, | ||
nclus, | ||
MaxNumClustersPerModules); | ||
|
||
// Stride = block size. | ||
const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]); | ||
|
||
// Get thread / CPU element indices in block. | ||
const auto& [firstElementIdxNoStride, endElementIdxNoStride] = | ||
cms::alpakatools::element_index_range_in_block(acc, Vec1::all(firstPixel)); | ||
|
||
if (nclus > MaxNumClustersPerModules) { | ||
uint32_t firstElementIdx = firstElementIdxNoStride[0u]; | ||
uint32_t endElementIdx = endElementIdxNoStride[0u]; | ||
// remove excess FIXME find a way to cut charge first.... | ||
for (uint32_t i = firstElementIdx; i < numElements; ++i) { | ||
if (!cms::alpakatools::get_next_element_1D_index_stride( | ||
i, firstElementIdx, endElementIdx, blockDimension, numElements)) | ||
break; | ||
if (id[i] == InvId) | ||
continue; // not valid | ||
if (id[i] != thisModuleId) | ||
break; // end of module | ||
if (clusterId[i] >= MaxNumClustersPerModules) { | ||
id[i] = InvId; | ||
clusterId[i] = InvId; | ||
} | ||
} | ||
nclus = MaxNumClustersPerModules; | ||
} | ||
|
||
#ifdef GPU_DEBUG | ||
if (thisModuleId % 100 == 1) | ||
if (threadIdxLocal == 0) | ||
printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx); | ||
#endif | ||
|
||
auto&& charge = alpaka::block::shared::st::allocVar<int32_t[MaxNumClustersPerModules], __COUNTER__>(acc); | ||
auto&& ok = alpaka::block::shared::st::allocVar<uint8_t[MaxNumClustersPerModules], __COUNTER__>(acc); | ||
auto&& newclusId = alpaka::block::shared::st::allocVar<uint16_t[MaxNumClustersPerModules], __COUNTER__>(acc); | ||
|
||
assert(nclus <= MaxNumClustersPerModules); | ||
cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { charge[i] = 0; }); | ||
alpaka::block::sync::syncBlockThreads(acc); | ||
|
||
uint32_t firstElementIdx = firstElementIdxNoStride[0u]; | ||
uint32_t endElementIdx = endElementIdxNoStride[0u]; | ||
for (uint32_t i = firstElementIdx; i < numElements; ++i) { | ||
if (!cms::alpakatools::get_next_element_1D_index_stride( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the main reason for this function (instead of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes the issue here was that 2 loops are introduced with Alpaka: one for the strided access, and an inner one for the CPU elements. Whether using a helper function and a lambda, or directly doing the looping on call site, one faces this 2-loops situation. This helper function allows to have the same logic as 2 loops, but with only 1 loop and no extra storage, which makes the portability of the legacy code easier: nothing has to be changed regarding break statements on call site and they can be kept 1:1. |
||
i, firstElementIdx, endElementIdx, blockDimension, numElements)) | ||
break; | ||
if (id[i] == InvId) | ||
continue; // not valid | ||
if (id[i] != thisModuleId) | ||
break; // end of module | ||
alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, &charge[clusterId[i]], static_cast<int32_t>(adc[i])); | ||
} | ||
alpaka::block::sync::syncBlockThreads(acc); | ||
|
||
auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) | ||
cms::alpakatools::for_each_element_1D_block_stride( | ||
acc, nclus, [&](uint32_t i) { newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; }); | ||
alpaka::block::sync::syncBlockThreads(acc); | ||
|
||
// renumber | ||
auto&& ws = alpaka::block::shared::st::allocVar<uint16_t[32], __COUNTER__>(acc); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The 32 here is related to the warp size, right? We should probably look later on how to make that dependent on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes totally. |
||
cms::alpakatools::blockPrefixScan(acc, newclusId, nclus, ws); | ||
|
||
assert(nclus >= newclusId[nclus - 1]); | ||
|
||
if (nclus == newclusId[nclus - 1]) | ||
return; | ||
|
||
nClustersInModule[thisModuleId] = newclusId[nclus - 1]; | ||
alpaka::block::sync::syncBlockThreads(acc); | ||
|
||
// mark bad cluster again | ||
cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { | ||
if (0 == ok[i]) | ||
newclusId[i] = InvId + 1; | ||
}); | ||
alpaka::block::sync::syncBlockThreads(acc); | ||
|
||
// reassign id | ||
firstElementIdx = firstElementIdxNoStride[0u]; | ||
endElementIdx = endElementIdxNoStride[0u]; | ||
for (uint32_t i = firstElementIdx; i < numElements; ++i) { | ||
if (!cms::alpakatools::get_next_element_1D_index_stride( | ||
i, firstElementIdx, endElementIdx, blockDimension, numElements)) | ||
break; | ||
if (id[i] == InvId) | ||
continue; // not valid | ||
if (id[i] != thisModuleId) | ||
break; // end of module | ||
clusterId[i] = newclusId[clusterId[i]] - 1; | ||
if (clusterId[i] == InvId) | ||
id[i] = InvId; | ||
} | ||
|
||
//done | ||
} | ||
}; | ||
|
||
} // namespace gpuClustering | ||
|
||
#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would have kept
endElementIdx = std::min(endElementIdx, maxNumberOfElements)
, but probably doesn't make much difference.