Skip to content

Commit

Permalink
clang-format
Browse files (browse the repository at this point in the history)
  • Loading branch information
ghugo83 committed Feb 1, 2021
1 parent 805756b commit 99cb983
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 68 deletions.
5 changes: 1 addition & 4 deletions src/alpaka/AlpakaCore/prefixScan.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,12 +4,10 @@
#include <cstdint>
#include "AlpakaCore/alpakaConfig.h"


#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED

template <typename T>
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan(
uint32_t laneId, T const* ci, T* co, uint32_t i, uint32_t mask) {
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T const* ci, T* co, uint32_t i, uint32_t mask) {
// ci and co may be the same
auto x = ci[i];
for (int offset = 1; offset < 32; offset <<= 1) {
Expand All @@ -33,7 +31,6 @@ ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T* c, u

#endif


namespace cms {
namespace alpakatools {
// limited to 32*32 elements....
Expand Down
121 changes: 57 additions & 64 deletions src/alpaka/test/alpaka/prefixScan_t.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,7 +18,6 @@ struct format_traits<float> {
static const constexpr char* failed_msg = "failed %d %d %d: %f %f\n";
};


template <typename T>
struct testPrefixScan {
template <typename T_Acc>
Expand All @@ -28,34 +27,33 @@ struct testPrefixScan {
auto&& co = alpaka::block::shared::st::allocVar<T[1024], __COUNTER__>(acc);

const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
cms::alpakatools::element_global_index_range(acc);
const auto& [firstElementIdxNoStride, endElementIdxNoStride] = cms::alpakatools::element_global_index_range(acc);

for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < size; threadIdx += blockDimension, endElementIdx += blockDimension) {
for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < size;
threadIdx += blockDimension, endElementIdx += blockDimension) {
for (uint32_t i = threadIdx; i < std::min(endElementIdx, size); ++i) {
c[i] = 1;
c[i] = 1;
}
}

alpaka::block::sync::syncBlockThreads(acc);

cms::alpakatools::blockPrefixScan(acc, c, co, size, ws);
cms::alpakatools::blockPrefixScan(acc, c, size, ws);

assert(1 == c[0]);
assert(1 == co[0]);
for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < size; threadIdx += blockDimension, endElementIdx += blockDimension) {
for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u]; threadIdx < size;
threadIdx += blockDimension, endElementIdx += blockDimension) {
for (uint32_t i = threadIdx + 1; i < std::min(endElementIdx, size); ++i) {
assert(c[i] == c[i - 1] + 1);
assert(c[i] == i + 1);
assert(c[i] = co[i]);
assert(c[i] == c[i - 1] + 1);
assert(c[i] == i + 1);
assert(c[i] = co[i]);
}
}

}
};


/*
* NB: GPU-only, so do not care about elements here.
*/
Expand Down Expand Up @@ -85,13 +83,12 @@ struct testWarpPrefixScan {
if (c[i] != c[i - 1] + 1)
printf(format_traits<T>::failed_msg, size, i, blockDimension, c[i], c[i - 1]);
assert(c[i] == c[i - 1] + 1);
assert(c[i] == i + 1);
assert(c[i] == i + 1);
assert(c[i] = co[i]);
}
}
};


struct init {
template <typename T_Acc>
ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t* v, uint32_t val, uint32_t n) const {
Expand All @@ -107,7 +104,6 @@ struct init {
}
};


struct verify {
template <typename T_Acc>
ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t const* v, uint32_t n) const {
Expand All @@ -118,27 +114,25 @@ struct verify {
assert(v[index] == index + 1);

if (index == 0)
printf("verify\n");
printf("verify\n");
}
}
};


int main() {
const DevHost host(alpaka::pltf::getDevByIdx<PltfHost>(0u));
const DevAcc1 device(alpaka::pltf::getDevByIdx<PltfAcc1>(0u));
const Vec1 size(1u);

Queue queue(device);


// WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY)
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
std::cout << "warp level" << std::endl;

const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(32));
const Vec1 blocksPerGrid1(Vec1::all(1));
const WorkDiv1 &workDivWarp = cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
const WorkDiv1& workDivWarp = cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1);

alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel<Acc1>(workDivWarp, testWarpPrefixScan<int>(), 32));

Expand All @@ -153,29 +147,26 @@ int main() {
// Running kernel with 1 block, and bs threads per block or elements per thread.
// NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version.
for (int bs = 32; bs <= 1024; bs += 32) {

const Vec1 threadsPerBlockOrElementsPerThread2(Vec1::all(bs));
const Vec1 blocksPerGrid2(Vec1::all(1));
const WorkDiv1 &workDivSingleBlock = cms::alpakatools::make_workdiv(blocksPerGrid2, threadsPerBlockOrElementsPerThread2);

std::cout << "blocks per grid: " << blocksPerGrid2
<< ", threads per block or elements per thread: " << threadsPerBlockOrElementsPerThread2
<< std::endl;

// Problem size
for (int j = 1; j <= 1024; ++j) {

alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivSingleBlock, testPrefixScan<uint16_t>(), j));
alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivSingleBlock, testPrefixScan<float>(), j));
}
}

const Vec1 threadsPerBlockOrElementsPerThread2(Vec1::all(bs));
const Vec1 blocksPerGrid2(Vec1::all(1));
const WorkDiv1& workDivSingleBlock =
cms::alpakatools::make_workdiv(blocksPerGrid2, threadsPerBlockOrElementsPerThread2);

std::cout << "blocks per grid: " << blocksPerGrid2
<< ", threads per block or elements per thread: " << threadsPerBlockOrElementsPerThread2 << std::endl;

// Problem size
for (int j = 1; j <= 1024; ++j) {
alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivSingleBlock, testPrefixScan<uint16_t>(), j));
alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivSingleBlock, testPrefixScan<float>(), j));
}
}

// PORTABLE MULTI-BLOCK PREFIXSCAN
int num_items = 200;
for (int ksize = 1; ksize < 4; ++ksize) {
for (int ksize = 1; ksize < 4; ++ksize) {
std::cout << "multiblock" << std::endl;
num_items *= 10;

Expand All @@ -185,48 +176,50 @@ int main() {
auto output1_dBuf = alpaka::mem::buf::alloc<uint32_t, Idx>(device, Vec1::all(num_items));
uint32_t* output1_d = alpaka::mem::view::getPtrNative(output1_dBuf);

const auto nThreadsInit = 256; // NB: 1024 would be better
const auto nThreadsInit = 256; // NB: 1024 would be better
// Just kept here to be identical to CUDA test
const Vec1 threadsPerBlockOrElementsPerThread3(Vec1::all(nThreadsInit));
const auto nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
const Vec1 blocksPerGrid3(Vec1::all(nBlocksInit));
const WorkDiv1 &workDivMultiBlockInit = cms::alpakatools::make_workdiv(blocksPerGrid3, threadsPerBlockOrElementsPerThread3);

alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivMultiBlockInit, init(), input_d, 1, num_items));
const WorkDiv1& workDivMultiBlockInit =
cms::alpakatools::make_workdiv(blocksPerGrid3, threadsPerBlockOrElementsPerThread3);

alpaka::queue::enqueue(
queue, alpaka::kernel::createTaskKernel<Acc1>(workDivMultiBlockInit, init(), input_d, 1, num_items));

const auto nThreads = 1024;
const Vec1 threadsPerBlockOrElementsPerThread4(Vec1::all(nThreads));
const auto nBlocks = (num_items + nThreads - 1) / nThreads;
const Vec1 blocksPerGrid4(Vec1::all(nBlocks));
const WorkDiv1 &workDivMultiBlock = cms::alpakatools::make_workdiv(blocksPerGrid4, threadsPerBlockOrElementsPerThread4);
const WorkDiv1& workDivMultiBlock =
cms::alpakatools::make_workdiv(blocksPerGrid4, threadsPerBlockOrElementsPerThread4);

std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl;
alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivMultiBlock,
cms::alpakatools::multiBlockPrefixScanFirstStep<uint32_t>(),
input_d,
output1_d,
num_items));

alpaka::queue::enqueue(
queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivMultiBlock,
cms::alpakatools::multiBlockPrefixScanFirstStep<uint32_t>(),
input_d,
output1_d,
num_items));

const Vec1 blocksPerGridSecondStep(Vec1::all(1));
const WorkDiv1 &workDivMultiBlockSecondStep = cms::alpakatools::make_workdiv(blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4);
alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivMultiBlockSecondStep,
cms::alpakatools::multiBlockPrefixScanSecondStep<uint32_t>(),
input_d,
output1_d,
num_items,
nBlocks));

const WorkDiv1& workDivMultiBlockSecondStep =
cms::alpakatools::make_workdiv(blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4);
alpaka::queue::enqueue(
queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivMultiBlockSecondStep,
cms::alpakatools::multiBlockPrefixScanSecondStep<uint32_t>(),
input_d,
output1_d,
num_items,
nBlocks));

alpaka::queue::enqueue(queue,
alpaka::kernel::createTaskKernel<Acc1>(workDivMultiBlock,
verify(), output1_d, num_items));
alpaka::kernel::createTaskKernel<Acc1>(workDivMultiBlock, verify(), output1_d, num_items));

alpaka::wait::wait(queue); // input_dBuf and output1_dBuf end of scope
} // ksize
} // ksize

return 0;
}

0 comments on commit 99cb983

Please sign in to comment.