From 2d77f78416fc33a06464d8faad28abb6093f2f38 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Fri, 8 Nov 2024 00:31:38 +0200
Subject: [PATCH] Update testPrefixScan to work with different warp sizes

---
 .../test/alpaka/testPrefixScan.dev.cc         | 30 ++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc
index 75a2f310e4fb4..a9df1ab547611 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc
@@ -34,7 +34,17 @@ template <typename T>
 struct testPrefixScan {
   template <typename TAcc>
   ALPAKA_FN_ACC void operator()(const TAcc& acc, unsigned int size) const {
+    // alpaka::warp::getSize(acc) is runtime, but we need a compile-time or constexpr value
+#if defined(__CUDA_ARCH__)
+    // CUDA always has a warp size of 32
     auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
+#elif defined(__HIP_DEVICE_COMPILE__)
+    // HIP/ROCm defines warpSize as a constant expression with value 32 or 64 depending on the target device
+    auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
+#else
+    // CPU back-ends always have a warp size of 1
+    auto& ws = alpaka::declareSharedVar<T[1], __COUNTER__>(acc);
+#endif
     auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
     auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
 
@@ -78,7 +88,7 @@ struct testWarpPrefixScan {
   template <typename TAcc>
   ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t size) const {
     if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-      ALPAKA_ASSERT_ACC(size <= 32);
+      ALPAKA_ASSERT_ACC(size <= static_cast<uint32_t>(alpaka::warp::getSize(acc)));
       auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
       auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
 
@@ -87,7 +97,8 @@ struct testWarpPrefixScan {
       auto i = blockThreadIdx;
       c[i] = 1;
       alpaka::syncBlockThreads(acc);
-      auto laneId = blockThreadIdx & 0x1f;
+      // a compile-time constant would be faster, but this is more portable
+      auto laneId = blockThreadIdx % alpaka::warp::getSize(acc);
 
       warpPrefixScan(acc, laneId, c, co, i);
       warpPrefixScan(acc, laneId, c, i);
@@ -152,13 +163,18 @@ int main() {
 
     if constexpr (!requires_single_thread_per_block_v<Acc1D>) {
       std::cout << "warp level" << std::endl;
-      const auto threadsPerBlockOrElementsPerThread = 32;
+      const auto threadsPerBlockOrElementsPerThread = warpSize;
      const auto blocksPerGrid = 1;
       const auto workDivWarp = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
 
-      alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 32));
-      alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 16));
-      alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 5));
+      if (warpSize >= 64)
+        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 64));
+      if (warpSize >= 32)
+        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 32));
+      if (warpSize >= 16)
+        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 12));
+      if (warpSize >= 8)
+        alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1D>(workDivWarp, testWarpPrefixScan<int>(), 5));
     }
 
     // PORTABLE BLOCK PREFIXSCAN
@@ -166,7 +182,7 @@ int main() {
     // Running kernel with 1 block, and bs threads per block or elements per thread.
    // NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version.
-    for (int bs = 32; bs <= 1024; bs += 32) {
+    for (int bs = warpSize; bs <= 1024; bs += warpSize) {
       const auto blocksPerGrid2 = 1;
       const auto workDivSingleBlock = make_workdiv<Acc1D>(blocksPerGrid2, bs);
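
Note: the main() hunks above rely on a host-side warpSize value whose definition lies outside
the excerpted hunks. For reference only, the sketch below shows one way such a value can be
obtained through the alpaka host API; the printPreferredWarpSizes helper, the templated
platform parameter and the use of alpaka::getDevs / alpaka::getPreferredWarpSize are
illustrative assumptions in alpaka 1.x style, not part of this patch.

    // Sketch (assumption, not part of the patch): query the warp size on the host,
    // so that work divisions and kernel arguments can adapt to the selected device.
    #include <cstddef>
    #include <iostream>

    #include <alpaka/alpaka.hpp>

    // TPlatform is a placeholder for any alpaka platform type (e.g. the platform behind Acc1D).
    template <typename TPlatform>
    void printPreferredWarpSizes() {
      TPlatform platform{};
      // enumerate the devices provided by this back-end
      for (auto const& device : alpaka::getDevs(platform)) {
        // preferred warp size: 32 on CUDA, 32 or 64 on HIP/ROCm, 1 on CPU back-ends
        std::size_t const warpSize = alpaka::getPreferredWarpSize(device);
        std::cout << alpaka::getName(device) << ": preferred warp size " << warpSize << '\n';
      }
    }

A value obtained this way matches the three branches introduced in testPrefixScan above:
32 for CUDA, 32 or 64 for HIP/ROCm, and 1 for the CPU back-ends.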