diff --git a/src/alpaka/AlpakaCore/prefixScan.h b/src/alpaka/AlpakaCore/prefixScan.h index 4f31a9746..153cd0a0f 100644 --- a/src/alpaka/AlpakaCore/prefixScan.h +++ b/src/alpaka/AlpakaCore/prefixScan.h @@ -4,12 +4,10 @@ #include #include "AlpakaCore/alpakaConfig.h" - #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED template -ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan( - uint32_t laneId, T const* ci, T* co, uint32_t i, uint32_t mask) { +ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T const* ci, T* co, uint32_t i, uint32_t mask) { // ci and co may be the same auto x = ci[i]; for (int offset = 1; offset < 32; offset <<= 1) { @@ -33,7 +31,6 @@ ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T* c, u #endif - namespace cms { namespace alpakatools { // limited to 32*32 elements.... diff --git a/src/alpaka/test/alpaka/prefixScan_t.cc b/src/alpaka/test/alpaka/prefixScan_t.cc index adcd3fa40..4317285a9 100644 --- a/src/alpaka/test/alpaka/prefixScan_t.cc +++ b/src/alpaka/test/alpaka/prefixScan_t.cc @@ -18,7 +18,6 @@ struct format_traits { static const constexpr char* failed_msg = "failed %d %d %d: %f %f\n"; }; - template struct testPrefixScan { template @@ -28,15 +27,15 @@ struct testPrefixScan { auto&& co = alpaka::block::shared::st::allocVar(acc); const uint32_t blockDimension(alpaka::workdiv::getWorkDiv(acc)[0u]); - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_global_index_range(acc); + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < size; threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < size; + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t i = threadIdx; i < std::min(endElementIdx, size); ++i) { - c[i] = 1; + c[i] = 1; } } - + alpaka::block::sync::syncBlockThreads(acc); cms::alpakatools::blockPrefixScan(acc, c, co, size, ws); @@ -44,18 +43,17 @@ struct testPrefixScan { assert(1 == c[0]); assert(1 == co[0]); - for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < size; threadIdx += blockDimension, endElementIdx += blockDimension) { + for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < size; + threadIdx += blockDimension, endElementIdx += blockDimension) { for (uint32_t i = threadIdx + 1; i < std::min(endElementIdx, size); ++i) { - assert(c[i] == c[i - 1] + 1); - assert(c[i] == i + 1); - assert(c[i] = co[i]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); } } - } }; - /* * NB: GPU-only, so do not care about elements here. */ @@ -85,13 +83,12 @@ struct testWarpPrefixScan { if (c[i] != c[i - 1] + 1) printf(format_traits::failed_msg, size, i, blockDimension, c[i], c[i - 1]); assert(c[i] == c[i - 1] + 1); - assert(c[i] == i + 1); + assert(c[i] == i + 1); assert(c[i] = co[i]); } } }; - struct init { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t* v, uint32_t val, uint32_t n) const { @@ -107,7 +104,6 @@ struct init { } }; - struct verify { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t const* v, uint32_t n) const { @@ -118,12 +114,11 @@ struct verify { assert(v[index] == index + 1); if (index == 0) - printf("verify\n"); + printf("verify\n"); } } }; - int main() { const DevHost host(alpaka::pltf::getDevByIdx(0u)); const DevAcc1 device(alpaka::pltf::getDevByIdx(0u)); @@ -131,14 +126,13 @@ int main() { Queue queue(device); - // WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY) #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED std::cout << "warp level" << std::endl; const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(32)); const Vec1 blocksPerGrid1(Vec1::all(1)); - const WorkDiv1 &workDivWarp = cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1); + const WorkDiv1& workDivWarp = cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1); alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel(workDivWarp, testWarpPrefixScan(), 32)); @@ -153,29 +147,26 @@ int main() { // Running kernel with 1 block, and bs threads per block or elements per thread. // NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version. for (int bs = 32; bs <= 1024; bs += 32) { - - const Vec1 threadsPerBlockOrElementsPerThread2(Vec1::all(bs)); - const Vec1 blocksPerGrid2(Vec1::all(1)); - const WorkDiv1 &workDivSingleBlock = cms::alpakatools::make_workdiv(blocksPerGrid2, threadsPerBlockOrElementsPerThread2); - - std::cout << "blocks per grid: " << blocksPerGrid2 - << ", threads per block or elements per thread: " << threadsPerBlockOrElementsPerThread2 - << std::endl; - - // Problem size - for (int j = 1; j <= 1024; ++j) { - - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); -} -} - + const Vec1 threadsPerBlockOrElementsPerThread2(Vec1::all(bs)); + const Vec1 blocksPerGrid2(Vec1::all(1)); + const WorkDiv1& workDivSingleBlock = + cms::alpakatools::make_workdiv(blocksPerGrid2, threadsPerBlockOrElementsPerThread2); + + std::cout << "blocks per grid: " << blocksPerGrid2 + << ", threads per block or elements per thread: " << threadsPerBlockOrElementsPerThread2 << std::endl; + + // Problem size + for (int j = 1; j <= 1024; ++j) { + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); + alpaka::queue::enqueue(queue, + alpaka::kernel::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); + } + } // PORTABLE MULTI-BLOCK PREFIXSCAN int num_items = 200; - for (int ksize = 1; ksize < 4; ++ksize) { + for (int ksize = 1; ksize < 4; ++ksize) { std::cout << "multiblock" << std::endl; num_items *= 10; @@ -185,48 +176,50 @@ int main() { auto output1_dBuf = alpaka::mem::buf::alloc(device, Vec1::all(num_items)); uint32_t* output1_d = alpaka::mem::view::getPtrNative(output1_dBuf); - const auto nThreadsInit = 256; // NB: 1024 would be better + const auto nThreadsInit = 256; // NB: 1024 would be better // Just kept here to be identical to CUDA test const Vec1 threadsPerBlockOrElementsPerThread3(Vec1::all(nThreadsInit)); const auto nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit; const Vec1 blocksPerGrid3(Vec1::all(nBlocksInit)); - const WorkDiv1 &workDivMultiBlockInit = cms::alpakatools::make_workdiv(blocksPerGrid3, threadsPerBlockOrElementsPerThread3); - - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivMultiBlockInit, init(), input_d, 1, num_items)); + const WorkDiv1& workDivMultiBlockInit = + cms::alpakatools::make_workdiv(blocksPerGrid3, threadsPerBlockOrElementsPerThread3); + alpaka::queue::enqueue( + queue, alpaka::kernel::createTaskKernel(workDivMultiBlockInit, init(), input_d, 1, num_items)); const auto nThreads = 1024; const Vec1 threadsPerBlockOrElementsPerThread4(Vec1::all(nThreads)); const auto nBlocks = (num_items + nThreads - 1) / nThreads; const Vec1 blocksPerGrid4(Vec1::all(nBlocks)); - const WorkDiv1 &workDivMultiBlock = cms::alpakatools::make_workdiv(blocksPerGrid4, threadsPerBlockOrElementsPerThread4); + const WorkDiv1& workDivMultiBlock = + cms::alpakatools::make_workdiv(blocksPerGrid4, threadsPerBlockOrElementsPerThread4); std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl; - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivMultiBlock, - cms::alpakatools::multiBlockPrefixScanFirstStep(), - input_d, - output1_d, - num_items)); - + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDivMultiBlock, + cms::alpakatools::multiBlockPrefixScanFirstStep(), + input_d, + output1_d, + num_items)); + const Vec1 blocksPerGridSecondStep(Vec1::all(1)); - const WorkDiv1 &workDivMultiBlockSecondStep = cms::alpakatools::make_workdiv(blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4); - alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivMultiBlockSecondStep, - cms::alpakatools::multiBlockPrefixScanSecondStep(), - input_d, - output1_d, - num_items, - nBlocks)); - + const WorkDiv1& workDivMultiBlockSecondStep = + cms::alpakatools::make_workdiv(blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4); + alpaka::queue::enqueue( + queue, + alpaka::kernel::createTaskKernel(workDivMultiBlockSecondStep, + cms::alpakatools::multiBlockPrefixScanSecondStep(), + input_d, + output1_d, + num_items, + nBlocks)); alpaka::queue::enqueue(queue, - alpaka::kernel::createTaskKernel(workDivMultiBlock, - verify(), output1_d, num_items)); + alpaka::kernel::createTaskKernel(workDivMultiBlock, verify(), output1_d, num_items)); alpaka::wait::wait(queue); // input_dBuf and output1_dBuf end of scope - } // ksize + } // ksize return 0; }