diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index 4a2d9e72e9366..1674b56c2e97a 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -130,6 +130,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { template struct FindClus { + // assume that we can cover the whole module with up to 16 blockDimension-wide iterations + static constexpr uint32_t maxIterGPU = 16; + + // this must be larger than maxPixInModule / maxIterGPU, and should be a multiple of the warp size + static constexpr uint32_t maxElementsPerBlock = + cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128); + template ALPAKA_FN_ACC void operator()(const TAcc& acc, SiPixelDigisSoAView digi_view, @@ -292,17 +299,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { #endif [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv(acc)[0u]; - // Assume that we can cover the whole module with up to 16 blockDimension-wide iterations - // This maxIter value was tuned for GPU, with 256 or 512 threads per block. - // Hence, also works for CPU case, with 256 or 512 elements per thread. - // Real constrainst is maxIter = hist.size() / blockDimension, - // with blockDimension = threadPerBlock * elementsPerThread. - // Hence, maxIter can be tuned accordingly to the workdiv. - constexpr unsigned int maxIterGPU = 16; + // assume that we can cover the whole module with up to maxIterGPU blockDimension-wide iterations ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU); - // NB: can be tuned. - constexpr uint32_t maxElements = cms::alpakatools::requires_single_thread_per_block_v ? 256 : 1; + // number of elements per thread + constexpr uint32_t maxElements = + cms::alpakatools::requires_single_thread_per_block_v ? maxElementsPerBlock : 1; ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0u] <= maxElements)); constexpr unsigned int maxIter = maxIterGPU * maxElements; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index 13b971753bd75..f54c9e9af29ec 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -641,25 +641,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::alpakatools::make_device_view(alpaka::getDev(queue), clusters_d->view().moduleStart(), 1u); alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement); - // TODO - // - we are fixing this here since it is used at compile time also in the kernel - // - put maxIter in the Geometry traits - constexpr auto threadsOrElementsFindClus = 256; - + const auto elementsPerBlockFindClus = FindClus::maxElementsPerBlock; const auto workDivMaxNumModules = - cms::alpakatools::make_workdiv(numberOfModules, threadsOrElementsFindClus); - // NB: With present FindClus() / chargeCut() algorithm, - // threadPerBlock (GPU) or elementsPerThread (CPU) = 256 show optimal performance. - // Though, it does not have to be the same number for CPU/GPU cases. - + cms::alpakatools::make_workdiv(numberOfModules, elementsPerBlockFindClus); #ifdef GPU_DEBUG - std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << threadsOrElementsFindClus + std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus << " threadsPerBlockOrElementsPerThread\n"; #endif - alpaka::exec( queue, workDivMaxNumModules, FindClus{}, digis_d->view(), clusters_d->view(), wordCounter); - #ifdef GPU_DEBUG alpaka::wait(queue); #endif @@ -740,14 +730,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::alpakatools::make_device_view(alpaka::getDev(queue), clusters_d->view().moduleStart(), 1u); alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement); - /// should be larger than maxPixInModule/16 aka (maxPixInModule/maxiter in the kernel) - - const auto threadsPerBlockFindClus = 256; - const auto workDivMaxNumModules = cms::alpakatools::make_workdiv(numberOfModules, threadsPerBlockFindClus); - + const auto elementsPerBlockFindClus = FindClus::maxElementsPerBlock; + const auto workDivMaxNumModules = + cms::alpakatools::make_workdiv(numberOfModules, elementsPerBlockFindClus); #ifdef GPU_DEBUG alpaka::wait(queue); - std::cout << "FindClus kernel launch with " << numberOfModules << " blocks of " << threadsPerBlockFindClus + std::cout << "FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus << " threadsPerBlockOrElementsPerThread\n"; #endif alpaka::exec(