From e25884b43f605969fc985be2e81614ec741d237d Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 19 Dec 2023 21:37:18 +0100 Subject: [PATCH 1/5] Add uniform_groups and uniform_group_elements type aliases --- .../AlpakaInterface/interface/workdivision.h | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 39f19fe463745..7e181363b1290 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -496,6 +496,46 @@ namespace cms::alpakatools { const Idx range_; }; + /* uniform_groups + * + * `uniform_groups(acc, elements)` returns a range that spans the group indices required to cover the given problem + * size, in units of the block size: + * - the `elements` argument indicates the total number of elements, across all groups. + * + * `uniform_groups` should be called consistently by all the threads in a block. All threads in a block see the same + * loop iterations, while threads in different blocks may see a different number of iterations. + * + * For example, if `elements` is 1000 and the block size is 16, + * + * for (auto group: uniform_groups(acc, 1000)) + * + * will return the range from 0 to 62, split across all blocks in the work division. + * + * If the work division has more than 63 blocks, the first 63 will perform one iteration of the loop, while the other + * blocks will exit immediately. + * If the work division has fewer than 63 blocks, some of the blocks will perform more than one iteration, in order to + * cover the whole problem space. 
+ */ + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> + using uniform_groups = blocks_with_stride<TAcc>; + + /* uniform_group_elements + * + * `uniform_group_elements(acc, group, elements)` returns a range that spans all the elements within the given group: + * - the `group` argument indicates the id of the current group, for example as obtained from `uniform_groups`; + * - the `elements` argument indicates the total number of elements, across all groups. + * + * Iterating over the range yields values of type `ElementIndex`, that contain the `.global` and `.local` indices of + * the corresponding element. + * + * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier when the + * element index reaches `elements`. + */ + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> + using uniform_group_elements = elements_in_block<TAcc>; + /* once_per_grid * * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid. From 6844aadf7892fe598a43fa8a31c99e1c5605edcd Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 21 Dec 2023 00:55:38 +0100 Subject: [PATCH 2/5] Add independent_groups and independent_group_elements helper classes - `independent_groups(acc, groups)` returns a range that spans the group indices from 0 to `groups`, with one group per block; - `independent_group_elements(acc, elements)` returns a range that spans all the elements within the given group, from 0 to `elements`. 
--- .../AlpakaInterface/interface/workdivision.h | 173 ++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 7e181363b1290..220abe46b1925 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -536,6 +536,179 @@ namespace cms::alpakatools { template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> using uniform_group_elements = elements_in_block<TAcc>; + /* independent_groups + * + * `independent_groups(acc, groups)` returns a range that spans the group indices from 0 to `groups`, with one group + * per block: + * - the `groups` argument indicates the total number of groups. + * + * If the work division has more blocks than `groups`, only the first `groups` blocks will perform one iteration of + * the loop, while the other blocks will exit immediately. + * If the work division has fewer blocks than `groups`, some of the blocks will perform more than one iteration, in + * order to cover the whole problem space. 
+ */ + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> + class independent_groups { + public: + ALPAKA_FN_ACC inline independent_groups(TAcc const& acc) + : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]}, + stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]}, + extent_{stride_} {} + + // extent is the total number of groups to iterate over (not the number of blocks) + ALPAKA_FN_ACC inline independent_groups(TAcc const& acc, Idx groups) + : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]}, + stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]}, + extent_{groups} {} + + class iterator { + friend class independent_groups; + + ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first) + : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {} + + public: + ALPAKA_FN_ACC inline Idx operator*() const { return first_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline iterator& operator++() { + // increment the group index by the grid stride + first_ += stride_; + if (first_ < extent_) + return *this; + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + return *this; + } + + // post-increment the iterator + ALPAKA_FN_ACC inline iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (first_ == other.first_); } + + ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } + + private: + // non-const to support iterator copy and assignment + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + }; + + ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); } + + ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); } + + private: + const Idx first_; + const Idx stride_; + const Idx extent_; + }; + + /* independent_group_elements + * + * `independent_group_elements(acc, elements)` returns a range that spans all the elements 
within the given group: + * - the `elements` argument indicates the number of elements in the current group. + * + * Iterating over the range yields the local element index, between `0` and `elements - 1`. The threads in the block + * will perform one or more iterations, depending on the number of elements per thread, and on the number of threads + * per block, compared with the total number of elements. + */ + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> + class independent_group_elements { + public: + ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc) + : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]}, + thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_}, + stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_}, + extent_{stride_} {} + + ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx extent) + : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]}, + thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_}, + stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_}, + extent_{extent} {} + + class iterator { + friend class independent_group_elements; + + ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first) + : elements_{elements}, + stride_{stride}, + extent_{extent}, + first_{std::min(first, extent)}, + index_{first_}, + range_{std::min(first + elements, extent)} {} + + public: + ALPAKA_FN_ACC inline Idx operator*() const { return index_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline iterator& operator++() { + if constexpr (requires_single_thread_per_block_v<TAcc>) { + // increment the index along the elements processed by the current thread + ++index_; + if (index_ < range_) + return *this; + } + + // increment the thread index with the block stride + first_ += stride_; + index_ = first_; + range_ = std::min(first_ + elements_, extent_); + if (index_ < extent_) + return *this; + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + index_ = extent_; + range_ = extent_; + return *this; + } + + // 
post-increment the iterator + ALPAKA_FN_ACC inline iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { + return (index_ == other.index_) and (first_ == other.first_); + } + + ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } + + private: + // non-const to support iterator copy and assignment + Idx elements_; + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + Idx index_; + Idx range_; + }; + + ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); } + + ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } + + private: + const Idx elements_; + const Idx thread_; + const Idx stride_; + const Idx extent_; + }; + /* once_per_grid * * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid. 
From 3dfb1a9f97c149aea43cd6206626ad188a0b1e06 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 21 Dec 2023 00:58:05 +0100 Subject: [PATCH 3/5] Add a test for independent_groups and independent_group_elements --- .../AlpakaInterface/test/BuildFile.xml | 7 + .../test/alpaka/testIndependentKernel.dev.cc | 144 ++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 5f9c5fe81981f..2d204819d740b 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -12,6 +12,13 @@ + + + + + + + diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc new file mode 100644 index 0000000000000..bd98efcfa32d6 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc @@ -0,0 +1,144 @@ +#include +#include + +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +// each test binary is built for a single Alpaka backend +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +/* Add the group id to te value of each element in the group. + * Each group is composed by the elements first[group]..first[group+1]-1 . 
+ */ +struct IndependentWorkKernel { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + T const* __restrict__ in, + T* __restrict__ out, + size_t const* __restrict__ indices, + size_t groups) const { + for (auto group : cms::alpakatools::independent_groups(acc, groups)) { + size_t first = indices[group]; + size_t last = indices[group + 1]; + size_t size = last - first; + for (auto index : cms::alpakatools::independent_group_elements(acc, size)) { + out[first + index] = in[first + index] + group; + } + } + } +}; + +/* Test the IndependentWorkKernel kernel on all devices + */ +template +void testIndependentWorkKernel(size_t groups, size_t grid_size, size_t block_size, TKernel kernel) { + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine engine{rd()}; + + // uniform distribution + std::uniform_int_distribution random_size{100, 201}; + + // gaussian distribution + std::normal_distribution dist{0., 1.}; + + // build the groups + std::vector sizes(groups); + auto indices_h = cms::alpakatools::make_host_buffer(groups + 1); + indices_h[0] = 0; + for (size_t i = 0; i < groups; ++i) { + auto size = random_size(engine); + sizes[i] = size; + indices_h[i + 1] = indices_h[i] + size; + } + + // tolerance + constexpr float epsilon = 0.000001; + + // buffer size + const size_t size = indices_h[groups]; + + // allocate the input and output host buffer in pinned memory accessible by the Platform devices + auto in_h = cms::alpakatools::make_host_buffer(size); + auto out_h = cms::alpakatools::make_host_buffer(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in_h[i] = dist(engine); + out_h[i] = 0; + } + + // run the test on each device + for (auto const& device : cms::alpakatools::devices()) { + std::cout << "Test IndependentWorkKernel on " << alpaka::getName(device) << " over " << size << " elements in " + << groups << " independent 
groups with " << grid_size << " blocks of " << block_size << " elements\n"; + auto queue = Queue(device); + + // allocate input and output buffers on the device + auto indices_d = cms::alpakatools::make_device_buffer(queue, groups + 1); + auto in_d = cms::alpakatools::make_device_buffer(queue, size); + auto out_d = cms::alpakatools::make_device_buffer(queue, size); + + // copy the input data to the device; the size is known from the buffer objects + alpaka::memcpy(queue, indices_d, indices_h); + alpaka::memcpy(queue, in_d, in_h); + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::memset(queue, out_d, 0.); + + // launch the 1-dimensional kernel with independent work groups + auto div = cms::alpakatools::make_workdiv(grid_size, block_size); + alpaka::exec(queue, div, kernel, in_d.data(), out_d.data(), indices_d.data(), groups); + + // copy the results from the device to the host + alpaka::memcpy(queue, out_h, out_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + // check the results + for (size_t g = 0; g < groups; ++g) { + size_t first = indices_h[g]; + size_t last = indices_h[g + 1]; + for (size_t i = first; i < last; ++i) { + float sum = in_h[i] + g; + float delta = std::max(std::fabs(sum) * epsilon, epsilon); + REQUIRE(out_h[i] < sum + delta); + REQUIRE(out_h[i] > sum - delta); + } + } + } +} + +TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend", + "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") { + SECTION("Independent work groups") { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + INFO("No devices available on the platform " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)); + REQUIRE(not devices.empty()); + } + + // launch the independent work kernel with a small block size and a small number of blocks; + // this relies on the kernel to loop over the "problem 
space" and do more work per block + std::cout << "Test independent work kernel with small block size, using scalar dimensions\n"; + testIndependentWorkKernel(100, 32, 32, IndependentWorkKernel{}); + + // launch the independent work kernel with a large block size and a single block; + // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data + std::cout << "Test independent work kernel with large block size, using scalar dimensions\n"; + testIndependentWorkKernel(100, 1, 1024, IndependentWorkKernel{}); + + // launch the independent work kernel with a large block size and a large number of blocks; + // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data + std::cout << "Test independent work kernel with large block size, using scalar dimensions\n"; + testIndependentWorkKernel(100, 1024, 1024, IndependentWorkKernel{}); + } +} From ce64e0a4583ff6d9660861d2cd83ea1355d6fd4c Mon Sep 17 00:00:00 2001 From: Eric Cano Date: Thu, 21 Dec 2023 11:44:38 +0100 Subject: [PATCH 4/5] Add support for shifted elements_with_stride loop in range for --- .../AlpakaInterface/interface/workdivision.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 220abe46b1925..0433980e7c6d6 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -86,6 +86,11 @@ namespace cms::alpakatools { }; /* elements_with_stride + * + * `elements_with_stride(acc, [first, ]extent)` returns an iteratable range that spans the element indices required to + * cover the given problem size: + * - `first` (optional) is index to the first element; if not specified, the loop starts from 0; + * - `extent` is the total size of the problem, including any elements that may come before 
`first`. */ template and alpaka::Dim::value == 1>> @@ -93,13 +98,19 @@ namespace cms::alpakatools { public: ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc) : elements_{alpaka::getWorkDiv(acc)[0u]}, - thread_{alpaka::getIdx(acc)[0u] * elements_}, + first_{alpaka::getIdx(acc)[0u] * elements_}, stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{stride_} {} ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent) : elements_{alpaka::getWorkDiv(acc)[0u]}, - thread_{alpaka::getIdx(acc)[0u] * elements_}, + first_{alpaka::getIdx(acc)[0u] * elements_}, + stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + extent_{extent} {} + + ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx first, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[0u]}, + first_{alpaka::getIdx(acc)[0u] * elements_ + first}, stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{extent} {} @@ -164,13 +175,13 @@ namespace cms::alpakatools { Idx range_; }; - ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); } + ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } private: const Idx elements_; - const Idx thread_; + const Idx first_; const Idx stride_; const Idx extent_; }; From d5e466bc9428bdb2c33f8df1b6ddfb52dc220788 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 21 Dec 2023 12:08:33 +0100 Subject: [PATCH 5/5] Add tests for the shifted elements_with_stride --- .../test/alpaka/testKernel.dev.cc | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc index 300f139b0c6e3..a730e4b515a76 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc +++ 
b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc @@ -23,6 +23,20 @@ struct VectorAddKernel { } }; +struct VectorAddKernelSkip { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + T const* __restrict__ in1, + T const* __restrict__ in2, + T* __restrict__ out, + size_t first, + size_t size) const { + for (auto index : cms::alpakatools::elements_with_stride(acc, first, size)) { + out[index] = in1[index] + in2[index]; + } + } +}; + struct VectorAddKernel1D { template ALPAKA_FN_ACC void operator()( @@ -224,6 +238,76 @@ void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::s } } +// test the 1-dimensional kernel on all devices, potentially skipping some elements +template +void testVectorAddKernelSkip(std::size_t skip_elements, + std::size_t problem_size, + std::size_t grid_size, + std::size_t block_size, + TKernel kernel) { + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // buffer size + const size_t size = problem_size; + + // allocate input and output host buffers in pinned memory accessible by the Platform devices + auto in1_h = cms::alpakatools::make_host_buffer(size); + auto in2_h = cms::alpakatools::make_host_buffer(size); + auto out_h = cms::alpakatools::make_host_buffer(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + // run the test on each device + for (auto const& device : cms::alpakatools::devices()) { + std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " skipping " << skip_elements << " over " + << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n"; + auto queue = Queue(device); + + // allocate input and output buffers 
on the device + auto in1_d = cms::alpakatools::make_device_buffer(queue, size); + auto in2_d = cms::alpakatools::make_device_buffer(queue, size); + auto out_d = cms::alpakatools::make_device_buffer(queue, size); + + // copy the input data to the device; the size is known from the buffer objects + alpaka::memcpy(queue, in1_d, in1_h); + alpaka::memcpy(queue, in2_d, in2_h); + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::memset(queue, out_d, 0.); + + // launch the 1-dimensional kernel with scalar size + auto div = cms::alpakatools::make_workdiv(grid_size, block_size); + alpaka::exec(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), skip_elements, size); + + // copy the results from the device to the host + alpaka::memcpy(queue, out_h, out_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + // check the results + for (size_t i = 0; i < skip_elements; ++i) { + REQUIRE(out_h[i] == 0); + } + for (size_t i = skip_elements; i < size; ++i) { + float sum = in1_h[i] + in2_h[i]; + REQUIRE(out_h[i] < sum + epsilon); + REQUIRE(out_h[i] > sum - epsilon); + } + } +} + // test the N-dimensional kernels on all devices template void testVectorAddKernelND(Vec problem_size, Vec grid_size, Vec block_size, TKernel kernel) { @@ -367,5 +451,15 @@ TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESP // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n"; testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{}); + + // launch the 1-dimensional kernel with a small block size and a small number of blocks; + // this relies on the kernel to loop over the "problem space" and do more work per block + std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n"; + testVectorAddKernelSkip(20, 10000, 
32, 32, VectorAddKernelSkip{}); + + // launch the 1-dimensional kernel with a large block size and a single block; + // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data + std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n"; + testVectorAddKernelSkip(20, 100, 1, 1024, VectorAddKernelSkip{}); } }