diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index cd1b3461c..f0ae89494 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -232,7 +232,10 @@ namespace cms { threadIdx < maxNumberOfElements; threadIdx += blockDimension, endElementIdx += blockDimension) { // (CPU) Loop on all elements. - for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { + if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { func(i); } } @@ -274,7 +277,10 @@ namespace cms { threadIdx < maxNumberOfElements; threadIdx += gridDimension, endElementIdx += gridDimension) { // (CPU) Loop on all elements. - for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) { + if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (uint32_t i = threadIdx; i < endElementIdx; ++i) { func(i); } } @@ -291,6 +297,27 @@ namespace cms { cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func); } + /* + * Case where the input index has reached the end of threadDimension: strides the input index. + * Otherwise: do nothing. + */ + ALPAKA_FN_ACC bool get_next_element_1D_index_stride(uint32_t& i, + uint32_t& firstElementIdx, + uint32_t& endElementIdx, + const uint32_t stride, + const uint32_t maxNumberOfElements) { + bool isNextStrideElementValid = true; + if (i == endElementIdx) { + firstElementIdx += stride; + endElementIdx += stride; + i = firstElementIdx; + if (i >= maxNumberOfElements) { + isNextStrideElementValid = false; + } + } + return isNextStrideElementValid; + } + } // namespace alpakatools } // namespace cms diff --git a/src/alpaka/Geometry/phase1PixelTopology.h b/src/alpaka/Geometry/phase1PixelTopology.h new file mode 100644 index 000000000..409ebec3c --- /dev/null +++ b/src/alpaka/Geometry/phase1PixelTopology.h @@ -0,0 +1,174 @@ +#ifndef Geometry_TrackerGeometryBuilder_phase1PixelTopology_h +#define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h + +#include +#include + +namespace phase1PixelTopology { + + constexpr uint16_t numRowsInRoc = 80; + constexpr uint16_t numColsInRoc = 52; + constexpr uint16_t lastRowInRoc = numRowsInRoc - 1; + constexpr uint16_t lastColInRoc = numColsInRoc - 1; + + constexpr uint16_t numRowsInModule = 2 * numRowsInRoc; + constexpr uint16_t numColsInModule = 8 * numColsInRoc; + constexpr uint16_t lastRowInModule = numRowsInModule - 1; + constexpr uint16_t lastColInModule = numColsInModule - 1; + + constexpr int16_t xOffset = -81; + constexpr int16_t yOffset = -54 * 4; + + constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule); + + constexpr uint32_t numberOfModules = 1856; + constexpr uint32_t numberOfLayers = 10; + constexpr uint32_t layerStart[numberOfLayers + 1] = {0, + 96, + 320, + 672, // barrel + 1184, + 1296, + 1408, // positive endcap + 1520, + 1632, + 1744, // negative endcap + numberOfModules}; + constexpr char const* layerName[numberOfLayers] = { + "BL1", + "BL2", + "BL3", + "BL4", // barrel + "E+1", + "E+2", + "E+3", // positive endcap + "E-1", + "E-2", + "E-3" // negative endcap + }; + + constexpr uint32_t numberOfModulesInBarrel = 1184; + constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / 8; + + template + constexpr auto map_to_array_helper(Function f, std::index_sequence) + -> std::array::type, 
sizeof...(Indices)> { + return {{f(Indices)...}}; + } + + template + constexpr auto map_to_array(Function f) -> std::array::type, N> { + return map_to_array_helper(f, std::make_index_sequence{}); + } + + constexpr uint32_t findMaxModuleStride() { + bool go = true; + int n = 2; + while (go) { + for (uint8_t i = 1; i < 11; ++i) { + if (layerStart[i] % n != 0) { + go = false; + break; + } + } + if (!go) + break; + n *= 2; + } + return n / 2; + } + + constexpr uint32_t maxModuleStride = findMaxModuleStride(); + + constexpr uint8_t findLayer(uint32_t detId) { + for (uint8_t i = 0; i < 11; ++i) + if (detId < layerStart[i + 1]) + return i; + return 11; + } + + constexpr uint8_t findLayerFromCompact(uint32_t detId) { + detId *= maxModuleStride; + for (uint8_t i = 0; i < 11; ++i) + if (detId < layerStart[i + 1]) + return i; + return 11; + } + + constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; + constexpr std::array layer = map_to_array(findLayerFromCompact); + + constexpr bool validateLayerIndex() { + bool res = true; + for (auto i = 0U; i < numberOfModules; ++i) { + auto j = i / maxModuleStride; + res &= (layer[j] < 10); + res &= (i >= layerStart[layer[j]]); + res &= (i < layerStart[layer[j] + 1]); + } + return res; + } + + static_assert(validateLayerIndex(), "layer from detIndex algo is buggy"); + + // this is for the ROC n<512 (upgrade 1024) + constexpr inline uint16_t divu52(uint16_t n) { + n = n >> 2; + uint16_t q = (n >> 1) + (n >> 4); + q = q + (q >> 4) + (q >> 5); + q = q >> 3; + uint16_t r = n - q * 13; + return q + ((r + 3) >> 4); + } + + constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); } + + constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); } + + constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; } + + constexpr inline uint16_t toRocY(uint16_t py) { + auto roc = divu52(py); + return py - 52 * roc; + } + + constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); } + + constexpr inline bool isBigPixY(uint16_t py) { + auto ly = toRocY(py); + return (ly == 0) | (ly == lastColInRoc); + } + + constexpr inline uint16_t localX(uint16_t px) { + auto shift = 0; + if (px > lastRowInRoc) + shift += 1; + if (px > numRowsInRoc) + shift += 1; + return px + shift; + } + + constexpr inline uint16_t localY(uint16_t py) { + auto roc = divu52(py); + auto shift = 2 * roc; + auto yInRoc = py - 52 * roc; + if (yInRoc > 0) + shift += 1; + return py + shift; + } + + //FIXME move it elsewhere? 
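+  // Worked examples for the coordinate helpers above (illustrative only, not used by the code):
+  //   divu52(51) == 0, divu52(52) == 1, divu52(104) == 2   (integer division by 52, valid for n < 512)
+  //   localX(79) == 79, localX(80) == 81, localX(81) == 83  (one empty slot is left after each big-pixel row, 79 and 80)
+  //   localY(0) == 0, localY(51) == 52, localY(52) == 54    (shift of 2 per preceding ROC, plus 1 past the first column of the ROC)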
+ struct AverageGeometry { + static constexpr auto numberOfLaddersInBarrel = phase1PixelTopology::numberOfLaddersInBarrel; + float ladderZ[numberOfLaddersInBarrel]; + float ladderX[numberOfLaddersInBarrel]; + float ladderY[numberOfLaddersInBarrel]; + float ladderR[numberOfLaddersInBarrel]; + float ladderMinZ[numberOfLaddersInBarrel]; + float ladderMaxZ[numberOfLaddersInBarrel]; + float endCapZ[2]; // just for pos and neg Layer1 + }; + +} // namespace phase1PixelTopology + +#endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h diff --git a/src/alpaka/plugin-SiPixelClusterizer/clustering_alpaka.h b/src/alpaka/plugin-SiPixelClusterizer/clustering_alpaka.h deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h new file mode 100644 index 000000000..456a25fa8 --- /dev/null +++ b/src/alpaka/plugin-SiPixelClusterizer/gpuClusterChargeCut.h @@ -0,0 +1,147 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h + +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" +#include "AlpakaCore/prefixScan.h" +#include "AlpakaDataFormats/gpuClusteringConstants.h" + +namespace gpuClustering { + + struct clusterChargeCut { + template + ALPAKA_FN_ACC void operator()( + const T_Acc& acc, + uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster) + uint16_t const* __restrict__ adc, // charge of each pixel + uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module + uint32_t* __restrict__ nClustersInModule, // modified: number of clusters found in each module + uint32_t const* __restrict__ moduleId, // module id of each module + int32_t* __restrict__ clusterId, // modified: cluster id of each pixel + const uint32_t numElements) const { + const uint32_t blockIdx(alpaka::idx::getIdx(acc)[0u]); + if (blockIdx >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + assert(thisModuleId == moduleId[blockIdx]); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus == 0) + return; + + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); + if (threadIdxLocal == 0 && nclus > MaxNumClustersPerModules) + printf("Warning too many clusters in module %d in block %d: %d > %d\n", + thisModuleId, + blockIdx, + nclus, + MaxNumClustersPerModules); + + // Stride = block size. + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + // Get thread / CPU element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_index_range_in_block(acc, Vec1::all(firstPixel)); + + if (nclus > MaxNumClustersPerModules) { + uint32_t firstElementIdx = firstElementIdxNoStride[0u]; + uint32_t endElementIdx = endElementIdxNoStride[0u]; + // remove excess FIXME find a way to cut charge first.... 
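+        // The loop below walks this thread's contiguous element range [firstElementIdx, endElementIdx);
+        // when i reaches endElementIdx, get_next_element_1D_index_stride() shifts the range by blockDimension
+        // and returns false once the shifted index passes numElements. On the GPU backend (one element per
+        // thread) this reduces to the usual "for (i = first; i < n; i += blockDim)" block-stride loop, while
+        // still allowing the "continue" / "break" statements of the legacy CUDA kernel.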
+ for (uint32_t i = firstElementIdx; i < numElements; ++i) { + if (!cms::alpakatools::get_next_element_1D_index_stride( + i, firstElementIdx, endElementIdx, blockDimension, numElements)) + break; + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + if (clusterId[i] >= MaxNumClustersPerModules) { + id[i] = InvId; + clusterId[i] = InvId; + } + } + nclus = MaxNumClustersPerModules; + } + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdxLocal == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx); +#endif + + auto&& charge = alpaka::block::shared::st::allocVar(acc); + auto&& ok = alpaka::block::shared::st::allocVar(acc); + auto&& newclusId = alpaka::block::shared::st::allocVar(acc); + + assert(nclus <= MaxNumClustersPerModules); + cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { charge[i] = 0; }); + alpaka::block::sync::syncBlockThreads(acc); + + uint32_t firstElementIdx = firstElementIdxNoStride[0u]; + uint32_t endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t i = firstElementIdx; i < numElements; ++i) { + if (!cms::alpakatools::get_next_element_1D_index_stride( + i, firstElementIdx, endElementIdx, blockDimension, numElements)) + break; + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + alpaka::atomic::atomicOp(acc, &charge[clusterId[i]], static_cast(adc[i])); + } + alpaka::block::sync::syncBlockThreads(acc); + + auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) + cms::alpakatools::for_each_element_1D_block_stride( + acc, nclus, [&](uint32_t i) { newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; }); + alpaka::block::sync::syncBlockThreads(acc); + + // renumber + auto&& ws = alpaka::block::shared::st::allocVar(acc); + cms::alpakatools::blockPrefixScan(acc, newclusId, nclus, ws); + + assert(nclus >= newclusId[nclus - 1]); + + if (nclus == newclusId[nclus - 1]) + return; + + nClustersInModule[thisModuleId] = newclusId[nclus - 1]; + alpaka::block::sync::syncBlockThreads(acc); + + // mark bad cluster again + cms::alpakatools::for_each_element_1D_block_stride(acc, nclus, [&](uint32_t i) { + if (0 == ok[i]) + newclusId[i] = InvId + 1; + }); + alpaka::block::sync::syncBlockThreads(acc); + + // reassign id + firstElementIdx = firstElementIdxNoStride[0u]; + endElementIdx = endElementIdxNoStride[0u]; + for (uint32_t i = firstElementIdx; i < numElements; ++i) { + if (!cms::alpakatools::get_next_element_1D_index_stride( + i, firstElementIdx, endElementIdx, blockDimension, numElements)) + break; + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + clusterId[i] = newclusId[clusterId[i]] - 1; + if (clusterId[i] == InvId) + id[i] = InvId; + } + + //done + } + }; + +} // namespace gpuClustering + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h diff --git a/src/alpaka/plugin-SiPixelClusterizer/gpuClustering.h b/src/alpaka/plugin-SiPixelClusterizer/gpuClustering.h new file mode 100644 index 000000000..73eaae60d --- /dev/null +++ b/src/alpaka/plugin-SiPixelClusterizer/gpuClustering.h @@ -0,0 +1,350 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h + +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" +#include "AlpakaCore/HistoContainer.h" +#include 
"AlpakaDataFormats/gpuClusteringConstants.h" +#include "Geometry/phase1PixelTopology.h" + +namespace gpuClustering { + +#ifdef GPU_DEBUG + // move to ALPAKA_ACCELERATOR_NAMESPACE ? + ALPAKA_STATIC_ACC_MEM_GLOBAL uint32_t gMaxHit = 0; +#endif + + struct countModules { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + uint16_t const* __restrict__ id, + uint32_t* __restrict__ moduleStart, + int32_t* __restrict__ clusterId, + const unsigned int numElements) const { + cms::alpakatools::for_each_element_1D_grid_stride(acc, numElements, [&](uint32_t i) { + clusterId[i] = i; + if (InvId != id[i]) { + int j = i - 1; + while (j >= 0 and id[j] == InvId) + --j; + if (j < 0 or id[j] != id[i]) { + // boundary... + constexpr auto max = MaxNumModules; + auto loc = alpaka::atomic::atomicOp(acc, moduleStart, max); + + moduleStart[loc + 1] = i; + } + } + }); + } + }; + + // __launch_bounds__(256,4) + struct findClus { + template + ALPAKA_FN_ACC void operator()( + const T_Acc& acc, + uint16_t const* __restrict__ id, // module id of each pixel + uint16_t const* __restrict__ x, // local coordinates of each pixel + uint16_t const* __restrict__ y, // + uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module + uint32_t* __restrict__ nClustersInModule, // output: number of clusters found in each module + uint32_t* __restrict__ moduleId, // output: module id of each module + int32_t* __restrict__ clusterId, // output: cluster id of each pixel + const unsigned int numElements) const { + const uint32_t blockIdx(alpaka::idx::getIdx(acc)[0u]); + if (blockIdx >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + + const uint32_t threadIdxLocal(alpaka::idx::getIdx(acc)[0u]); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdxLocal == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx); +#endif + + // find the index of the first pixel not belonging to this module (or invalid) + auto&& msize = alpaka::block::shared::st::allocVar(acc); + msize = numElements; + alpaka::block::sync::syncBlockThreads(acc); + + // Stride = block size. + const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + + // Get thread / CPU element indices in block. 
+ const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + cms::alpakatools::element_index_range_in_block(acc, Vec1::all(firstPixel)); + uint32_t firstElementIdx = firstElementIdxNoStride[0u]; + uint32_t endElementIdx = endElementIdxNoStride[0u]; + + // skip threads not associated to an existing pixel + for (uint32_t i = firstElementIdx; i < numElements; ++i) { + if (!cms::alpakatools::get_next_element_1D_index_stride( + i, firstElementIdx, endElementIdx, blockDimension, numElements)) + break; + if (id[i] == InvId) // skip invalid pixels + continue; + if (id[i] != thisModuleId) { // find the first pixel in a different module + alpaka::atomic::atomicOp(acc, &msize, i); + break; + } + } + + //init hist (ymax=416 < 512 : 9bits) + constexpr uint32_t maxPixInModule = 4000; + constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; + using Hist = cms::alpakatools::HistoContainer; + auto&& hist = alpaka::block::shared::st::allocVar(acc); + auto&& ws = alpaka::block::shared::st::allocVar(acc); + + cms::alpakatools::for_each_element_1D_block_stride(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); + alpaka::block::sync::syncBlockThreads(acc); + + assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); + + // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + if (0 == threadIdxLocal) { + if (msize - firstPixel > maxPixInModule) { + printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); + msize = maxPixInModule + firstPixel; + } + } + + alpaka::block::sync::syncBlockThreads(acc); + assert(msize - firstPixel <= maxPixInModule); + +#ifdef GPU_DEBUG + auto&& totGood = alpaka::block::shared::st::allocVar(acc); + totGood = 0; + alpaka::block::sync::syncBlockThreads(acc); +#endif + + // fill histo + cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + hist.count(acc, y[i]); +#ifdef GPU_DEBUG + alpaka::atomic::atomicOp(acc, &totGood, 1u); +#endif + } + }); + alpaka::block::sync::syncBlockThreads(acc); + cms::alpakatools::for_each_element_in_thread_1D_index_in_block(acc, 32u, [&](uint32_t i) { + ws[i] = 0; // used by prefix scan... + }); + alpaka::block::sync::syncBlockThreads(acc); + hist.finalize(acc, ws); + alpaka::block::sync::syncBlockThreads(acc); +#ifdef GPU_DEBUG + assert(hist.size() == totGood); + if (thisModuleId % 100 == 1) + if (threadIdxLocal == 0) + printf("histo size %d\n", hist.size()); +#endif + cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + hist.fill(acc, y[i], i - firstPixel); + } + }); + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // assume that we can cover the whole module with up to 16 blockDimension-wide iterations + constexpr unsigned int maxiter = 16; +#else + const unsigned int maxiter = + hist.size(); // After change of threadDimension, should be changed to hist.size() / blockDimension. +#endif + + // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event + constexpr int maxNeighbours = 10; + assert((hist.size() / blockDimension) <= maxiter); + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + constexpr uint32_t threadDimension = 1; +#else + // For now, match legacy, for perf comparison purposes. After fixes in perf, this should be tuned. 
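+      // threadDimension is the compile-time upper bound on the number of elements handled per thread;
+      // it only sizes the nn / nnn arrays below, and the actual per-thread element count of the work
+      // division is checked against it at run time (runTimeThreadDimension <= threadDimension).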
+ constexpr uint32_t threadDimension = 1; +#endif + const uint32_t runTimeThreadDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); + assert(runTimeThreadDimension <= threadDimension); + + // nearest neighbour + uint16_t nn[maxiter][threadDimension][maxNeighbours]; + uint8_t nnn[maxiter][threadDimension]; // number of nn + for (uint32_t elementIdx = 0; elementIdx < threadDimension; ++elementIdx) { + for (uint32_t k = 0; k < maxiter; ++k) { + nnn[k][elementIdx] = 0; + } + } + + alpaka::block::sync::syncBlockThreads(acc); // for hit filling! + +#ifdef GPU_DEBUG + // look for anomalous high occupancy + auto&& n40 = alpaka::block::shared::st::allocVar(acc); + auto&& n60 = alpaka::block::shared::st::allocVar(acc); + n40 = n60 = 0; + alpaka::block::sync::syncBlockThreads(acc); + cms::alpakatools::for_each_element_1D_block_stride(acc, Hist::nbins(), [&](uint32_t j) { + if (hist.size(j) > 60) + alpaka::atomic::atomicOp(acc, &n60, 1u); + if (hist.size(j) > 40) + alpaka::atomic::atomicOp(acc, &n40, 1u); + }); + alpaka::block::sync::syncBlockThreads(acc); + if (0 == threadIdxLocal) { + if (n60 > 0) + printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); + else if (n40 > 0) + printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); + } + alpaka::block::sync::syncBlockThreads(acc); +#endif + + // fill NN + uint32_t k = 0u; + cms::alpakatools::for_each_element_1D_block_stride(acc, hist.size(), [&](uint32_t j) { + const uint32_t jEquivalentClass = j % threadDimension; + k = j / blockDimension; + assert(k < maxiter); + auto p = hist.begin() + j; + auto i = *p + firstPixel; + assert(id[i] != InvId); + assert(id[i] == thisModuleId); // same module + int be = Hist::bin(y[i] + 1); + auto e = hist.end(be); + ++p; + assert(0 == nnn[k][jEquivalentClass]); + for (; p < e; ++p) { + auto m = (*p) + firstPixel; + assert(m != i); + assert(int(y[m]) - int(y[i]) >= 0); + assert(int(y[m]) - int(y[i]) <= 1); + if (std::abs(int(x[m]) - int(x[i])) <= 1) { + auto l = nnn[k][jEquivalentClass]++; + assert(l < maxNeighbours); + nn[k][jEquivalentClass][l] = *p; + } + } + }); + + // for each pixel, look at all the pixels until the end of the module; + // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; + // after the loop, all the pixel in each cluster should have the id equeal to the lowest + // pixel in the cluster ( clus[i] == i ). 
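+      // The while loop alternates two phases until a full pass makes no change in any thread (checked
+      // with syncBlockThreadsPredicate over "more"): on even passes each pixel's cluster id is atomically
+      // lowered to the minimum over its precomputed neighbours (nn); on odd passes the id chains are
+      // compressed by following clusterId[] up to its fixed point, as in a union-find.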
+ bool more = true; + int nloops = 0; + while (alpaka::block::sync::syncBlockThreadsPredicate(acc, more)) { + if (1 == nloops % 2) { + cms::alpakatools::for_each_element_1D_block_stride(acc, hist.size(), [&](uint32_t j) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + auto m = clusterId[i]; + while (m != clusterId[m]) + m = clusterId[m]; + clusterId[i] = m; + }); + } else { + more = false; + uint32_t k = 0u; + cms::alpakatools::for_each_element_1D_block_stride(acc, hist.size(), [&](uint32_t j) { + k = j / blockDimension; + const uint32_t jEquivalentClass = j % threadDimension; + auto p = hist.begin() + j; + auto i = *p + firstPixel; + for (int kk = 0; kk < nnn[k][jEquivalentClass]; ++kk) { + auto l = nn[k][jEquivalentClass][kk]; + auto m = l + firstPixel; + assert(m != i); + auto old = alpaka::atomic::atomicOp(acc, &clusterId[m], clusterId[i]); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + more = true; + } + alpaka::atomic::atomicOp(acc, &clusterId[i], old); + } // nnloop + }); // pixel loop + } + ++nloops; + } // end while + +#ifdef GPU_DEBUG + { + auto&& n0 = alpaka::block::shared::st::allocVar(acc); + if (threadIdxLocal == 0) + n0 = nloops; + alpaka::block::sync::syncBlockThreads(acc); + auto ok = n0 == nloops; + assert(alpaka::block::sync::syncBlockThreadsPredicate(acc, ok)); + if (thisModuleId % 100 == 1) + if (threadIdxLocal == 0) + printf("# loops %d\n", nloops); + } +#endif + + auto&& foundClusters = alpaka::block::shared::st::allocVar(acc); + foundClusters = 0; + alpaka::block::sync::syncBlockThreads(acc); + + // find the number of different clusters, identified by a pixels with clus[i] == i; + // mark these pixels with a negative id. + cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + if (clusterId[i] == static_cast(i)) { + auto old = alpaka::atomic::atomicOp(acc, &foundClusters, 0xffffffff); + clusterId[i] = -(old + 1); + } + } + }); + alpaka::block::sync::syncBlockThreads(acc); + + // propagate the negative id to all the pixels in the cluster. 
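+      // Numbering scheme: the previous loop tagged each cluster seed (clusterId[i] == i) with a unique
+      // negative id -(k+1); here every member inherits its seed's negative id, and the last loop below
+      // remaps -(k+1) to k, so valid pixels end up with cluster ids 0 .. foundClusters-1 and invalid
+      // pixels are set to -9999.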
+ cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + if (clusterId[i] >= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } + } + }); + alpaka::block::sync::syncBlockThreads(acc); + + // adjust the cluster id to be a positive value starting from 0 + cms::alpakatools::for_each_element_1D_block_stride(acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] == InvId) { // skip invalid pixels + clusterId[i] = -9999; + } else { + clusterId[i] = -clusterId[i] - 1; + } + }); + alpaka::block::sync::syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + nClustersInModule[thisModuleId] = foundClusters; + moduleId[blockIdx] = thisModuleId; +#ifdef GPU_DEBUG + if (foundClusters > gMaxHit) { + gMaxHit = foundClusters; + if (foundClusters > 8) + printf("max hit %d in %d\n", foundClusters, thisModuleId); + } +#endif +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + printf("%d clusters in module %d\n", foundClusters, thisModuleId); +#endif + } + } + }; + +} // namespace gpuClustering + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h diff --git a/src/alpaka/test/alpaka/clustering_alpaka.cc b/src/alpaka/test/alpaka/clustering_alpaka.cc deleted file mode 100644 index 4f4982afa..000000000 --- a/src/alpaka/test/alpaka/clustering_alpaka.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include -#include "AlpakaCore/alpakaConfig.h" - -int main() { - std::cout << "Exit success" << std::endl; - - return 0; -} diff --git a/src/alpaka/test/alpaka/clustering_t.cc b/src/alpaka/test/alpaka/clustering_t.cc new file mode 100644 index 000000000..295ecccb3 --- /dev/null +++ b/src/alpaka/test/alpaka/clustering_t.cc @@ -0,0 +1,387 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" + +// dirty, but works +#include "plugin-SiPixelClusterizer/gpuClustering.h" +#include "plugin-SiPixelClusterizer/gpuClusterChargeCut.h" + +int main(void) { + using namespace gpuClustering; + + const DevHost host(alpaka::pltf::getDevByIdx(0u)); + const ALPAKA_ACCELERATOR_NAMESPACE::DevAcc1 device( + alpaka::pltf::getDevByIdx(0u)); + ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); + + constexpr unsigned int numElements = 256 * 2000; + // these in reality are already on GPU + auto h_id_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_id = alpaka::mem::view::getPtrNative(h_id_buf); + auto h_x_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_x = alpaka::mem::view::getPtrNative(h_x_buf); + auto h_y_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_y = alpaka::mem::view::getPtrNative(h_y_buf); + auto h_adc_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_adc = alpaka::mem::view::getPtrNative(h_adc_buf); + + auto h_clus_buf = alpaka::mem::buf::alloc(host, numElements); + auto h_clus = alpaka::mem::view::getPtrNative(h_clus_buf); + + auto d_id_buf = alpaka::mem::buf::alloc(device, numElements); + auto d_x_buf = alpaka::mem::buf::alloc(device, numElements); + auto d_y_buf = alpaka::mem::buf::alloc(device, numElements); + auto d_adc_buf = alpaka::mem::buf::alloc(device, numElements); + auto d_clus_buf = alpaka::mem::buf::alloc(device, numElements); + + auto d_moduleStart_buf = alpaka::mem::buf::alloc(device, MaxNumModules + 1); + auto d_clusInModule_buf = alpaka::mem::buf::alloc(device, MaxNumModules); + auto 
d_moduleId_buf = alpaka::mem::buf::alloc(device, MaxNumModules); + + // later random number + unsigned int n = 0; + int ncl = 0; + int y[10] = {5, 7, 9, 1, 3, 0, 4, 8, 2, 6}; + + auto generateClusters = [&](int kn) { + auto addBigNoise = 1 == kn % 2; + if (addBigNoise) { + constexpr int MaxPixels = 1000; + int id = 666; + for (int x = 0; x < 140; x += 3) { + for (int yy = 0; yy < 400; yy += 3) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = yy; + h_adc[n] = 1000; + ++n; + ++ncl; + if (MaxPixels <= ncl) + break; + } + if (MaxPixels <= ncl) + break; + } + } + + { + // isolated + int id = 42; + int x = 10; + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = kn == 0 ? 100 : 5000; + ++n; + + // first column + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = 0; + h_adc[n] = 5000; + ++n; + // first columns + ++ncl; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 2; + h_adc[n] = 5000; + ++n; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 1; + h_adc[n] = 5000; + ++n; + + // last column + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = 415; + h_adc[n] = 5000; + ++n; + // last columns + ++ncl; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 415; + h_adc[n] = 2500; + ++n; + h_id[n] = id; + h_x[n] = x + 80; + h_y[n] = 414; + h_adc[n] = 2500; + ++n; + + // diagonal + ++ncl; + for (int x = 20; x < 25; ++x) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 1000; + ++n; + } + ++ncl; + // reversed + for (int x = 45; x > 40; --x) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 1000; + ++n; + } + ++ncl; + h_id[n++] = InvId; // error + // messy + int xx[5] = {21, 25, 23, 24, 22}; + for (int k = 0; k < 5; ++k) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 20 + xx[k]; + h_adc[n] = 1000; + ++n; + } + // holes + ++ncl; + for (int k = 0; k < 5; ++k) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 100; + h_adc[n] = kn == 2 ? 100 : 1000; + ++n; + if (xx[k] % 2 == 0) { + h_id[n] = id; + h_x[n] = xx[k]; + h_y[n] = 101; + h_adc[n] = 1000; + ++n; + } + } + } + { + // id == 0 (make sure it works! + int id = 0; + int x = 10; + ++ncl; + h_id[n] = id; + h_x[n] = x; + h_y[n] = x; + h_adc[n] = 5000; + ++n; + } + // all odd id + for (int id = 11; id <= 1800; id += 2) { + if ((id / 20) % 2) + h_id[n++] = InvId; // error + for (int x = 0; x < 40; x += 4) { + ++ncl; + if ((id / 10) % 2) { + for (int k = 0; k < 10; ++k) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x + y[k]; + h_adc[n] = 100; + ++n; + h_id[n] = id; + h_x[n] = x + 1; + h_y[n] = x + y[k] + 2; + h_adc[n] = 1000; + ++n; + } + } else { + for (int k = 0; k < 10; ++k) { + h_id[n] = id; + h_x[n] = x; + h_y[n] = x + y[9 - k]; + h_adc[n] = kn == 2 ? 10 : 1000; + ++n; + if (y[k] == 3) + continue; // hole + if (id == 51) { + h_id[n++] = InvId; + h_id[n++] = InvId; + } // error + h_id[n] = id; + h_x[n] = x + 1; + h_y[n] = x + y[k] + 2; + h_adc[n] = kn == 2 ? 
10 : 1000; + ++n; + } + } + } + } + }; // end lambda + for (auto kkk = 0; kkk < 5; ++kkk) { + n = 0; + ncl = 0; + generateClusters(kkk); + + std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; + assert(n <= numElements); + + auto h_nModules_buf = alpaka::mem::buf::alloc(host, 1u); + auto nModules = alpaka::mem::view::getPtrNative(h_nModules_buf); + nModules[0] = 0; + alpaka::mem::view::copy(queue, d_moduleStart_buf, h_nModules_buf, 1u); + + alpaka::mem::view::copy(queue, d_id_buf, h_id_buf, n); + alpaka::mem::view::copy(queue, d_x_buf, h_x_buf, n); + alpaka::mem::view::copy(queue, d_y_buf, h_y_buf, n); + alpaka::mem::view::copy(queue, d_adc_buf, h_adc_buf, n); + + // Launch CUDA Kernels +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + const int threadsPerBlockOrElementsPerThread = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); +#else + // For now, match legacy, for perf comparison purposes. After fixes in perf, this should be tuned. + const int threadsPerBlockOrElementsPerThread = 1; +#endif + + // COUNT MODULES +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + const int blocksPerGridCountModules = + (numElements + threadsPerBlockOrElementsPerThread - 1) / threadsPerBlockOrElementsPerThread; +#else + const int blocksPerGridCountModules = 1; +#endif + const WorkDiv1& workDivCountModules = cms::alpakatools::make_workdiv(Vec1::all(blocksPerGridCountModules), + Vec1::all(threadsPerBlockOrElementsPerThread)); + std::cout << "CUDA countModules kernel launch with " << blocksPerGridCountModules << " blocks of " + << threadsPerBlockOrElementsPerThread << " threads (GPU) or elements (CPU). \n"; + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDivCountModules, + countModules(), + alpaka::mem::view::getPtrNative(d_id_buf), + alpaka::mem::view::getPtrNative(d_moduleStart_buf), + alpaka::mem::view::getPtrNative(d_clus_buf), + n)); + + // FIND CLUSTER + const WorkDiv1& workDivMaxNumModules = + cms::alpakatools::make_workdiv(Vec1::all(MaxNumModules), Vec1::all(threadsPerBlockOrElementsPerThread)); + std::cout << "CUDA findModules kernel launch with " << MaxNumModules << " blocks of " + << threadsPerBlockOrElementsPerThread << " threads (GPU) or elements (CPU). 
\n"; + + alpaka::mem::view::set(queue, d_clusInModule_buf, 0, MaxNumModules); + + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDivMaxNumModules, + findClus(), + alpaka::mem::view::getPtrNative(d_id_buf), + alpaka::mem::view::getPtrNative(d_x_buf), + alpaka::mem::view::getPtrNative(d_y_buf), + alpaka::mem::view::getPtrNative(d_moduleStart_buf), + alpaka::mem::view::getPtrNative(d_clusInModule_buf), + alpaka::mem::view::getPtrNative(d_moduleId_buf), + alpaka::mem::view::getPtrNative(d_clus_buf), + n)); + alpaka::mem::view::copy(queue, h_nModules_buf, d_moduleStart_buf, 1u); + + auto h_nclus_buf = alpaka::mem::buf::alloc(host, MaxNumModules); + auto nclus = alpaka::mem::view::getPtrNative(h_nclus_buf); + alpaka::mem::view::copy(queue, h_nclus_buf, d_clusInModule_buf, MaxNumModules); + + // Wait for memory transfers to be completed + alpaka::wait::wait(queue); + + auto h_moduleId_buf = alpaka::mem::buf::alloc(host, nModules[0]); + //auto moduleId = alpaka::mem::view::getPtrNative(h_moduleId_buf); + + std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters" + << std::endl; + for (auto i = MaxNumModules; i > 0; i--) + if (nclus[i - 1] > 0) { + std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl; + break; + } + if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0)) + std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; + + // CLUSTER CHARGE CUT + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel( + workDivMaxNumModules, + clusterChargeCut(), + alpaka::mem::view::getPtrNative(d_id_buf), + alpaka::mem::view::getPtrNative(d_adc_buf), + alpaka::mem::view::getPtrNative(d_moduleStart_buf), + alpaka::mem::view::getPtrNative(d_clusInModule_buf), + alpaka::mem::view::getPtrNative(d_moduleId_buf), + alpaka::mem::view::getPtrNative(d_clus_buf), + n)); + alpaka::mem::view::copy(queue, h_id_buf, d_id_buf, n); + alpaka::mem::view::copy(queue, h_clus_buf, d_clus_buf, n); + alpaka::mem::view::copy(queue, h_nclus_buf, d_clusInModule_buf, MaxNumModules); + alpaka::mem::view::copy(queue, h_moduleId_buf, d_moduleId_buf, nModules[0]); + + // Wait for memory transfers to be completed + alpaka::wait::wait(queue); + std::cout << "found " << nModules[0] << " Modules active" << std::endl; + + // CROSS-CHECK + std::set clids; + for (unsigned int i = 0; i < n; ++i) { + assert(h_id[i] != 666); // only noise + if (h_id[i] == InvId) + continue; + assert(h_clus[i] >= 0); + assert(h_clus[i] < int(nclus[h_id[i]])); + clids.insert(h_id[i] * 1000 + h_clus[i]); + // clids.insert(h_clus[i]); + } + + // verify no hole in numbering + auto p = clids.begin(); + auto cmid = (*p) / 1000; + assert(0 == (*p) % 1000); + auto c = p; + ++c; + std::cout << "first clusters " << *p << ' ' << *c << ' ' << nclus[cmid] << ' ' << nclus[(*c) / 1000] << std::endl; + std::cout << "last cluster " << *clids.rbegin() << ' ' << nclus[(*clids.rbegin()) / 1000] << std::endl; + for (; c != clids.end(); ++c) { + auto cc = *c; + auto pp = *p; + auto mid = cc / 1000; + auto pnc = pp % 1000; + auto nc = cc % 1000; + if (mid != cmid) { + assert(0 == cc % 1000); + assert(nclus[cmid] - 1 == pp % 1000); + // if (nclus[cmid]-1 != pp%1000) std::cout << "error size " << mid << ": " << nclus[mid] << ' ' << pp << std::endl; + cmid = mid; + p = c; + continue; + } + p = c; + // assert(nc==pnc+1); + if (nc != pnc + 1) + std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl; + } + + std::cout << "found " << 
std::accumulate(nclus, nclus + MaxNumModules, 0) << ' ' << clids.size() << " clusters" + << std::endl; + for (auto i = MaxNumModules; i > 0; i--) + if (nclus[i - 1] > 0) { + std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl; + break; + } + // << " and " << seeds.size() << " seeds" << std::endl; + } /// end loop kkk + return 0; +}
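For reference, a minimal host-side sketch (not part of the patch) of the strided element iteration that the new cms::alpakatools::get_next_element_1D_index_stride helper implements and that the kernels above rely on; the work-division numbers (threadsPerBlock, elementsPerThread) are made-up values for illustration, and next_index simply mirrors the helper's logic outside of Alpaka.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Same logic as cms::alpakatools::get_next_element_1D_index_stride.
bool next_index(uint32_t& i, uint32_t& first, uint32_t& end, uint32_t stride, uint32_t max) {
  if (i == end) {   // this thread exhausted its current contiguous chunk
    first += stride;
    end += stride;
    i = first;
    if (i >= max)
      return false;  // the strided chunk starts past the end of the data
  }
  return true;
}

int main() {
  constexpr uint32_t numElements = 1000;
  constexpr uint32_t threadsPerBlock = 32;   // hypothetical work division
  constexpr uint32_t elementsPerThread = 4;  // CPU backend: several elements per "thread"
  constexpr uint32_t stride = threadsPerBlock * elementsPerThread;

  std::vector<int> visits(numElements, 0);
  for (uint32_t t = 0; t < threadsPerBlock; ++t) {
    uint32_t first = t * elementsPerThread;
    uint32_t end = first + elementsPerThread;
    // Same loop shape as in clusterChargeCut / findClus.
    for (uint32_t i = first; i < numElements; ++i) {
      if (!next_index(i, first, end, stride, numElements))
        break;
      ++visits[i];  // kernel body would go here
    }
  }
  for (auto v : visits)
    assert(v == 1);  // every element is visited exactly once, with no overlap between threads
  std::printf("all %u elements visited exactly once\n", numElements);
  return 0;
}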