From 9e2682529ba37a1568d7ffe118dbca2d2fb4be74 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 8 Apr 2024 08:59:10 +0200
Subject: [PATCH] Split implementation to separate header files

Rename classes to CamelCase and move them to the detail namespace:
  - uniform_elements_along to detail::UniformElementsAlong
  - uniform_groups_along to detail::UniformGroupsAlong
  - uniform_group_elements_along to detail::UniformGroupElementsAlong
  - uniform_elements_nd to detail::UniformElementsND
  - independent_groups_along to detail::IndependentGroupsAlong
  - independent_group_elements_along to detail::IndependentGroupElementsAlong

Move the implementation to separate header files.
Introduce helper functions with the old names.
---
 .../detail/IndependentGroupElementsAlong.h    | 122 +++
 .../interface/detail/IndependentGroupsAlong.h | 124 +++
 .../interface/detail/UniformElementsAlong.h   | 176 ++++
 .../interface/detail/UniformElementsND.h      | 226 +++
 .../detail/UniformGroupElementsAlong.h        | 153 +++
 .../interface/detail/UniformGroupsAlong.h     | 139 +++
 .../AlpakaInterface/interface/workdivision.h  | 943 ++----------------
 7 files changed, 1049 insertions(+), 834 deletions(-)
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupElementsAlong.h
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupsAlong.h
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsAlong.h
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsND.h
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupElementsAlong.h
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupsAlong.h

diff --git a/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupElementsAlong.h b/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupElementsAlong.h
new file mode 100644
index 0000000000000..995df41dc0380
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupElementsAlong.h
@@ -0,0 +1,122 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupElementsAlong_h
+#define HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupElementsAlong_h
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace cms::alpakatools::detail {
+
+  using namespace alpaka_common;
+
+  /* IndependentGroupElementsAlong
+   *
+   * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
+   * `IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
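+   *
+   * A minimal usage sketch (the kernel context, `numGroups`, `groupSize`, `buffer` and `process` are hypothetical;
+   * it assumes each block processes one or more independent groups of `groupSize` elements):
+   *
+   *   for (auto group : independent_groups(acc, numGroups)) {
+   *     for (auto element : independent_group_elements(acc, groupSize)) {
+   *       // `element` is an index local to the current group
+   *       process(buffer[group * groupSize + element]);
+   *     }
+   *   }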
+   */
+
+  template <typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  class IndependentGroupElementsAlong {
+  public:
+    ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+          extent_{stride_} {}
+
+    ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+          extent_{extent} {}
+
+    ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
+          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+          extent_{extent} {}
+
+    class const_iterator;
+    using iterator = const_iterator;
+
+    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }
+
+    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }
+
+    class const_iterator {
+      friend class IndependentGroupElementsAlong;
+
+      ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
+          : elements_{elements},
+            stride_{stride},
+            extent_{extent},
+            first_{std::min(first, extent)},
+            index_{first_},
+            range_{std::min(first + elements, extent)} {}
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline const_iterator& operator++() {
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // increment the index along the elements processed by the current thread
+          ++index_;
+          if (index_ < range_)
+            return *this;
+        }
+
+        // increment the thread index with the block stride
+        first_ += stride_;
+        index_ = first_;
+        range_ = std::min(first_ + elements_, extent_);
+        if (index_ < extent_)
+          return *this;
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        index_ = extent_;
+        range_ = extent_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline const_iterator operator++(int) {
+        const_iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
+        return (index_ == other.index_) and (first_ == other.first_);
+      }
+
+      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Idx elements_;
+      Idx stride_;
+      Idx extent_;
+      // modified by the pre/post-increment operator
+      Idx first_;
+      Idx index_;
+      Idx range_;
+    };
+
+  private:
+    const Idx elements_;
+    const Idx thread_;
+    const Idx stride_;
+    const Idx extent_;
+  };
+
+}  // namespace cms::alpakatools::detail
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupElementsAlong_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupsAlong.h b/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupsAlong.h
new file mode 100644
index 0000000000000..4ea0fe6c23a6e
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupsAlong.h
@@ -0,0 +1,124 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupsAlong_h
+#define HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupsAlong_h
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace cms::alpakatools::detail {
+
+  using namespace alpaka_common;
+
+  /* IndependentGroupsAlong
+   *
+   * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iterable range that spans the group
+   * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
+   * specified, it defaults to the number of blocks along the `Dim` dimension.
+   *
+   * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that can
+   * infer the accelerator type from the argument.
+   *
+   * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
+   * `IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
+   *
+   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
+   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
+   * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands
+   * for `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<TAcc, N-2>` and `<TAcc, N-3>`.
+   *
+   * `independent_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads
+   * in a block see the same loop iterations, while threads in different blocks may see a different number of iterations.
+   * If the work division has more blocks than the required number of groups, the first blocks will perform one
+   * iteration of the loop, while the other blocks will exit the loop immediately.
+   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more than
+   * one iteration, in order to cover the whole problem space.
+   *
+   * For example,
+   *
+   *   for (auto group : independent_groups_along<Dim>(acc, 7))
+   *
+   * will return the group range from 0 to 6, distributed across all blocks in the work division.
+   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
+   * blocks will exit the loop immediately. For example, if the work division has 8 blocks, the blocks from 0 to 6 will
+   * process one group while block 7 will not process any.
+   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the loop,
+   * in order to cover the whole problem space. For example, if the work division has 4 blocks, block 0 will process
+   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
+   * group 3.
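+   *
+   * A minimal sketch of the intended use (the kernel context and `numJets` are hypothetical):
+   *
+   *   // each block works on one jet at a time, independently of the other blocks
+   *   for (auto jet : independent_groups(acc, numJets)) {
+   *     // all the threads in the current block cooperate on the jet with index `jet`
+   *     ...
+   *   }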
+   */
+
+  template <typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  class IndependentGroupsAlong {
+  public:
+    ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+          extent_{stride_} {}
+
+    ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+          extent_{groups} {}
+
+    class const_iterator;
+    using iterator = const_iterator;
+
+    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }
+
+    class const_iterator {
+      friend class IndependentGroupsAlong;
+
+      ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
+          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline const_iterator& operator++() {
+        // increment the first-element-in-block index by the grid stride
+        first_ += stride_;
+        if (first_ < extent_)
+          return *this;
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline const_iterator operator++(int) {
+        const_iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }
+
+      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Idx stride_;
+      Idx extent_;
+      // modified by the pre/post-increment operator
+      Idx first_;
+    };
+
+  private:
+    const Idx first_;
+    const Idx stride_;
+    const Idx extent_;
+  };
+
+}  // namespace cms::alpakatools::detail
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupsAlong_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsAlong.h b/HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsAlong.h
new file mode 100644
index 0000000000000..2f036822e752c
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsAlong.h
@@ -0,0 +1,176 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_detail_UniformElementsAlong_h
+#define HeterogeneousCore_AlpakaInterface_interface_detail_UniformElementsAlong_h
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace cms::alpakatools::detail {
+
+  using namespace alpaka_common;
+
+  /* UniformElementsAlong
+   *
+   * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iterable range that spans the
+   * element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension.
+   * If `first` is not specified, it defaults to 0.
+   * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension.
+   *
+   * `uniform_elements_along<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that can
+   * infer the accelerator type from the argument.
+   *
+   * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for
+   * `UniformElementsAlong<TAcc, 0>(acc, ...)`.
+   *
+   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
+   * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
+   * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for
+   * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<TAcc, N-2>` and `<TAcc, N-3>`.
+   *
+   * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
+   * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
+   * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
+   * loop over each group's elements, and synchronise only in the outer loop:
+   *
+   *   for (auto group : uniform_groups_along<Dim>(acc, extent)) {
+   *     for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
+   *       // first part of the computation
+   *       // no synchronisations here
+   *       ...
+   *     }
+   *     // wait for all threads to complete the first part
+   *     alpaka::syncBlockThreads();
+   *     for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
+   *       // second part of the computation
+   *       // no synchronisations here
+   *       ...
+   *     }
+   *     // wait for all threads to complete the second part
+   *     alpaka::syncBlockThreads();
+   *     ...
+   *   }
+   *
+   * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
+   * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
+   * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
+   * explicitly inside the loop:
+   *
+   *   for (auto element : uniform_elements_along<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
+   *     bool flag = false;
+   *     if (element < extent) {
+   *       // do some work and compute a result flag only for the valid elements
+   *       flag = do_some_work();
+   *     }
+   *     // check if any valid element had a positive result
+   *     if (alpaka::warp::any(acc, flag)) {
+   *       // ...
+   *     }
+   *   }
+   *
+   * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
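+   *
+   * A complete kernel based on the 1-dimensional shorthand could look like (the kernel name and its arguments are
+   * hypothetical):
+   *
+   *   struct SquareKernel {
+   *     template <typename TAcc>
+   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, Idx size) const {
+   *       // grid-strided loop over all the elements, distributed across all threads
+   *       for (Idx i : uniform_elements(acc, size)) {
+   *         data[i] = data[i] * data[i];
+   *       }
+   *     }
+   *   };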
+ */ + + template and alpaka::Dim::value >= Dim>> + class UniformElementsAlong { + public: + ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, + extent_{stride_} {} + + ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, + extent_{extent} {} + + ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[Dim]}, + first_{alpaka::getIdx(acc)[Dim] * elements_ + first}, + stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, + extent_{extent} {} + + class const_iterator; + using iterator = const_iterator; + + ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); } + + ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); } + + class const_iterator { + friend class UniformElementsAlong; + + ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first) + : elements_{elements}, + stride_{stride}, + extent_{extent}, + first_{std::min(first, extent)}, + index_{first_}, + range_{std::min(first + elements, extent)} {} + + public: + ALPAKA_FN_ACC inline Idx operator*() const { return index_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline const_iterator& operator++() { + if constexpr (requires_single_thread_per_block_v) { + // increment the index along the elements processed by the current thread + ++index_; + if (index_ < range_) + return *this; + } + + // increment the thread index with the grid stride + first_ += stride_; + index_ = first_; + range_ = std::min(first_ + elements_, extent_); + if (index_ < extent_) + return *this; + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + index_ = extent_; + range_ = extent_; + return *this; + } + + // post-increment the iterator + ALPAKA_FN_ACC inline const_iterator operator++(int) { + const_iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { + return (index_ == other.index_) and (first_ == other.first_); + } + + ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); } + + private: + // non-const to support iterator copy and assignment + Idx elements_; + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + Idx index_; + Idx range_; + }; + + private: + const Idx elements_; + const Idx first_; + const Idx stride_; + const Idx extent_; + }; + +} // namespace cms::alpakatools::detail + +#endif // HeterogeneousCore_AlpakaInterface_interface_detail_UniformElementsAlong_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsND.h b/HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsND.h new file mode 100644 index 0000000000000..c1a64b3fa2128 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsND.h @@ -0,0 +1,226 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_detail_UniformElementsND_h +#define HeterogeneousCore_AlpakaInterface_interface_detail_UniformElementsND_h + +#include +#include +#include + +#include + +#include 
"HeterogeneousCore/AlpakaInterface/interface/config.h" + +namespace cms::alpakatools::detail { + + using namespace alpaka_common; + + /* UniformElementsND + * + * `UniformElementsND(acc, extent)` returns an N-dimensional iteratable range that spans the element indices + * required to cover the given problem size, indicated by `extent`. + * + * `uniform_elements_nd(acc, ...)` is an alias for `UniformElementsND(acc, ...)`. + * + * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not + * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. + * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner + * loop over each group's elements, and synchronise only in the outer loop: + * + * for (auto group0 : uniform_groups_along<0>(acc, extent[0])) { + * for (auto group1 : uniform_groups_along<1>(acc, extent[1])) { + * for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) { + * for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) { + * // first part of the computation + * // no synchronisations here + * ... + * } + * } + * // wait for all threads to complete the first part + * alpaka::syncBlockThreads(); + * for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) { + * for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) { + * // second part of the computation + * // no synchronisations here + * ... + * } + * } + * // wait for all threads to complete the second part + * alpaka::syncBlockThreads(); + * ... + * } + * } + * + * For more details, see `UniformElementsAlong(acc, ...)`. + */ + + template and (alpaka::Dim::value > 0)>> + class UniformElementsND { + public: + using Dim = alpaka::Dim; + using Vec = alpaka::Vec; + + ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc) + : elements_{alpaka::getWorkDiv(acc)}, + thread_{alpaka::getIdx(acc) * elements_}, + stride_{alpaka::getWorkDiv(acc) * elements_}, + extent_{stride_} {} + + ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent) + : elements_{alpaka::getWorkDiv(acc)}, + thread_{alpaka::getIdx(acc) * elements_}, + stride_{alpaka::getWorkDiv(acc) * elements_}, + extent_{extent} {} + + // tag used to construct an end iterator + struct at_end_t {}; + + class const_iterator; + using iterator = const_iterator; + + ALPAKA_FN_ACC inline const_iterator begin() const { + // check that all dimensions of the current thread index are within the extent + if ((thread_ < extent_).all()) { + // construct an iterator pointing to the first element to be processed by the current thread + return const_iterator{this, thread_}; + } else { + // construct an end iterator, pointing post the end of the extent + return const_iterator{this, at_end_t{}}; + } + } + + ALPAKA_FN_ACC inline const_iterator end() const { + // construct an end iterator, pointing post the end of the extent + return const_iterator{this, at_end_t{}}; + } + + class const_iterator { + friend class UniformElementsND; + + public: + ALPAKA_FN_ACC inline Vec operator*() const { return index_; } + + // pre-increment the iterator + ALPAKA_FN_ACC constexpr inline const_iterator operator++() { + increment(); + return *this; + } + + // post-increment the iterator + ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) { + const_iterator old = *this; + increment(); + return old; + } + + ALPAKA_FN_ACC constexpr inline bool 
operator==(const_iterator const& other) const { + return (index_ == other.index_); + } + + ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const { return not(*this == other); } + + private: + // construct an iterator pointing to the first element to be processed by the current thread + ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first) + : loop_{loop}, + first_{alpaka::elementwise_min(first, loop->extent_)}, + range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}, + index_{first_} {} + + // construct an end iterator, pointing post the end of the extent + ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&) + : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {} + + template + ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() { + bool overflow = false; + ++index_[I]; + if (index_[I] >= range_[I]) { + index_[I] = first_[I]; + overflow = true; + } + return overflow; + } + + template + ALPAKA_FN_ACC inline constexpr bool do_elements_loops() { + if constexpr (N == 0) { + // overflow + return true; + } else { + if (not nth_elements_loop()) { + return false; + } else { + return do_elements_loops(); + } + } + } + + template + ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() { + bool overflow = false; + first_[I] += loop_->stride_[I]; + if (first_[I] >= loop_->extent_[I]) { + first_[I] = loop_->thread_[I]; + overflow = true; + } + index_[I] = first_[I]; + range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]); + return overflow; + } + + template + ALPAKA_FN_ACC inline constexpr bool do_strided_loops() { + if constexpr (N == 0) { + // overflow + return true; + } else { + if (not nth_strided_loop()) { + return false; + } else { + return do_strided_loops(); + } + } + } + + // increment the iterator + ALPAKA_FN_ACC inline constexpr void increment() { + if constexpr (requires_single_thread_per_block_v) { + // linear N-dimensional loops over the elements associated to the thread; + // do_elements_loops<>() returns true if any of those loops overflows + if (not do_elements_loops()) { + // the elements loops did not overflow, return the next index + return; + } + } + + // strided N-dimensional loop over the threads in the kernel launch grid; + // do_strided_loops<>() returns true if any of those loops overflows + if (not do_strided_loops()) { + // the strided loops did not overflow, return the next index + return; + } + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = loop_->extent_; + range_ = loop_->extent_; + index_ = loop_->extent_; + } + + // const pointer to the UniformElementsND that the iterator refers to + const UniformElementsND* loop_; + + // modified by the pre/post-increment operator + Vec first_; // first element processed by this thread + Vec range_; // last element processed by this thread + Vec index_; // current element processed by this thread + }; + + private: + const Vec elements_; + const Vec thread_; + const Vec stride_; + const Vec extent_; + }; + +} // namespace cms::alpakatools::detail + +#endif // HeterogeneousCore_AlpakaInterface_interface_detail_UniformElementsND_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupElementsAlong.h b/HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupElementsAlong.h new file mode 100644 index 0000000000000..84a53cf6f23d1 --- /dev/null +++ 
b/HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupElementsAlong.h
@@ -0,0 +1,153 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_detail_UniformGroupElementsAlong_h
+#define HeterogeneousCore_AlpakaInterface_interface_detail_UniformGroupElementsAlong_h
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace cms::alpakatools::detail {
+
+  using namespace alpaka_common;
+
+  /* ElementIndex
+   *
+   * an aggregate that contains the `.global` and `.local` indices of an element; returned by iterating over the
+   * objects returned by `uniform_group_elements` and similar functions.
+   */
+
+  struct ElementIndex {
+    Idx global;
+    Idx local;
+  };
+
+  /* UniformGroupElementsAlong
+   *
+   * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iterable range that spans
+   * all the elements within the given `group` along dimension `Dim`, as obtained from `UniformGroupsAlong<TAcc, Dim>`,
+   * up to `elements` (exclusive). `elements` indicates the total number of elements across all groups; if not
+   * specified, it defaults to the kernel grid size.
+   *
+   * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc, ...)`
+   * that can infer the accelerator type from the argument.
+   *
+   * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for
+   * `UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
+   *
+   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
+   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
+   * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are
+   * shorthands for `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<TAcc, N-2>` and `<TAcc, N-3>`.
+   *
+   * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
+   * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
+   * local index spans the range from 0 to the block size (excluded).
+   *
+   * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
+   * global element index reaches `elements`.
+   *
+   * If the problem size is not a multiple of the block size, different threads may execute a different number of
+   * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
+   * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
+   * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
+   * `uniform_group_elements_along<Dim>`.
+   *
+   * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
+   * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
+   * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
+   * index explicitly inside the loop:
+   *
+   *   for (auto element : uniform_group_elements_along<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
+   *     bool flag = false;
+   *     if (element < elements) {
+   *       // do some work and compute a result flag only for the valid elements
+   *       flag = do_some_work();
+   *     }
+   *     // check if any valid element had a positive result
+   *     if (alpaka::warp::any(acc, flag)) {
+   *       // ...
+   *     }
+   *   }
+   *
+   * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
+   */
+
+  template <typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  class UniformGroupElementsAlong {
+  public:
+    ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
+        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
+          local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
+                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+          range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}
+
+    ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
+        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
+          local_{std::min(extent - first_,
+                          alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
+                              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
+          range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}
+
+    class const_iterator;
+    using iterator = const_iterator;
+
+    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }
+
+    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }
+
+    class const_iterator {
+      friend class UniformGroupElementsAlong;
+
+      ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
+          : index_{local}, first_{first}, range_{range} {}
+
+    public:
+      ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline const_iterator& operator++() {
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // increment the index along the elements processed by the current thread
+          ++index_;
+          if (index_ < range_)
+            return *this;
+        }
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        index_ = range_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline const_iterator operator++(int) {
+        const_iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }
+
+      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
+
+    private:
+      // modified by the pre/post-increment operator
+      Idx index_;
+      // non-const to support iterator copy and assignment
+      Idx first_;
+      Idx range_;
+    };
+
+  private:
+    const Idx first_;
+    const Idx local_;
+    const Idx range_;
+  };
+
+}  // namespace cms::alpakatools::detail
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_detail_UniformGroupElementsAlong_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupsAlong.h b/HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupsAlong.h
new file mode 100644
index 0000000000000..b9dfbe843dc99
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupsAlong.h
@@ -0,0 +1,139 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_detail_UniformGroupsAlong_h
+#define HeterogeneousCore_AlpakaInterface_interface_detail_UniformGroupsAlong_h
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace cms::alpakatools::detail {
+
+  using namespace alpaka_common;
+
+  /* UniformGroupsAlong
+   *
+   * `UniformGroupsAlong<TAcc, Dim>(acc, elements)` returns a one-dimensional iterable range that spans the group
+   * indices required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements`
+   * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid size
+   * along the `Dim` dimension.
+   *
+   * `uniform_groups_along<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can infer
+   * the accelerator type from the argument.
+   *
+   * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, 0>(acc, ...)`.
+   *
+   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
+   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
+   * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for
+   * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<TAcc, N-2>` and `<TAcc, N-3>`.
+   *
+   * `uniform_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads in a
+   * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
+   * If the work division has more blocks than the required number of groups, the first blocks will perform one
+   * iteration of the loop, while the other blocks will exit the loop immediately.
+   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more than
+   * one iteration, in order to cover the whole problem space.
+   *
+   * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
+   * than the block size. However, also in this case all threads in the block will execute the same number of iterations
+   * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop
+   * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by
+   * `uniform_group_elements_along<Dim>(acc, group, elements)`.
+   *
+   * For example, if the block size is 64 and there are 400 elements,
+   *
+   *   for (auto group : uniform_groups_along<Dim>(acc, 400))
+   *
+   * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
+   * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
+   * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
+   * the inner loop to not process the non-existing elements after 399.
+   *
+   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
+   * blocks will exit the loop immediately. For example, if the work division has 8 blocks, the blocks from 0 to 6 will
+   * process one group while block 7 will not process any.
+   *
+   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the loop,
+   * in order to cover the whole problem space. For example, if the work division has 4 blocks, block 0 will process
+   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
+   * group 3.
+   *
+   * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
+   * `uniform_group_elements_along<Dim>`.
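+   *
+   * A minimal sketch combining the two loops (`size`, `input` and `shared` are hypothetical); the outer loop is
+   * uniform across the block, so it is safe to synchronise between the inner loops:
+   *
+   *   for (auto group : uniform_groups(acc, size)) {
+   *     for (auto element : uniform_group_elements(acc, group, size)) {
+   *       shared[element.local] = input[element.global];
+   *     }
+   *     alpaka::syncBlockThreads(acc);
+   *     ...
+   *   }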
+ */ + + template and alpaka::Dim::value >= Dim>> + class UniformGroupsAlong { + public: + ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc) + : first_{alpaka::getIdx(acc)[Dim]}, + stride_{alpaka::getWorkDiv(acc)[Dim]}, + extent_{stride_} {} + + // extent is the total number of elements (not blocks) + ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent) + : first_{alpaka::getIdx(acc)[Dim]}, + stride_{alpaka::getWorkDiv(acc)[Dim]}, + extent_{divide_up_by(extent, alpaka::getWorkDiv(acc)[Dim])} {} + + class const_iterator; + using iterator = const_iterator; + + ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); } + + ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); } + + class const_iterator { + friend class UniformGroupsAlong; + + ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first) + : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {} + + public: + ALPAKA_FN_ACC inline Idx operator*() const { return first_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline const_iterator& operator++() { + // increment the first-element-in-block index by the grid stride + first_ += stride_; + if (first_ < extent_) + return *this; + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + return *this; + } + + // post-increment the iterator + ALPAKA_FN_ACC inline const_iterator operator++(int) { + const_iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); } + + ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); } + + private: + // non-const to support iterator copy and assignment + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + }; + + private: + const Idx first_; + const Idx stride_; + const Idx extent_; + }; + +} // namespace cms::alpakatools::detail + +#endif // HeterogeneousCore_AlpakaInterface_interface_detail_UniformGroupsAlong_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 3475d00e91259..242f98302b7a3 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -6,7 +6,12 @@ #include #include "HeterogeneousCore/AlpakaInterface/interface/config.h" -#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupElementsAlong.h" +#include "HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupsAlong.h" +#include "HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsAlong.h" +#include "HeterogeneousCore/AlpakaInterface/interface/detail/UniformElementsND.h" +#include "HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupElementsAlong.h" +#include "HeterogeneousCore/AlpakaInterface/interface/detail/UniformGroupsAlong.h" namespace cms::alpakatools { @@ -75,172 +80,6 @@ namespace cms::alpakatools { } } - /* ElementIndex - * - * an aggregate that containes the `.global` and `.local` indices of an element; returned by iterating over the objecs - * returned by `elements_in_block` and similar functions. 
- */ - - struct ElementIndex { - Idx global; - Idx local; - }; - - /* uniform_elements_along - * - * `uniform_elements_along(acc [, first], extent)` returns a one-dimensional iteratable range that spans the - * element indices from `first` (inclusive) to `extent` (exlusive) along the `Dim` dimension. - * If `first` is not specified, it defaults to 0. - * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension. - * - * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`. - * - * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed - * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). - * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for - * `uniform_elements_along(acc, ...)`, `` and ``. - * - * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not - * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. - * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner - * loop over each group's elements, and synchronise only in the outer loop: - * - * for (auto group : uniform_groups_along(acc, extent)) { - * for (auto element : uniform_group_elements_along(acc, group, extent)) { - * // first part of the computation - * // no synchronisations here - * ... - * } - * // wait for all threads to complete the first part - * alpaka::syncBlockThreads(); - * for (auto element : uniform_group_elements_along(acc, group, extent)) { - * // second part of the computation - * // no synchronisations here - * ... - * } - * // wait for all threads to complete the second part - * alpaka::syncBlockThreads(); - * ... - * } - * - * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple - * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may - * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index - * explicitly inside the loop: - * - * for (auto element : uniform_elements_along(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) { - * bool flag = false; - * if (element < extent) { - * // do some work and compute a result flag only for the valid elements - * flag = do_some_work(); - * } - * // check if any valid element had a positive result - * if (alpaka::warp::any(acc, flag)) { - * // ... - * } - * } - * - * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`. 
- */ - - template and alpaka::Dim::value >= Dim>> - class uniform_elements_along { - public: - ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc) - : elements_{alpaka::getWorkDiv(acc)[Dim]}, - first_{alpaka::getIdx(acc)[Dim] * elements_}, - stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, - extent_{stride_} {} - - ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx extent) - : elements_{alpaka::getWorkDiv(acc)[Dim]}, - first_{alpaka::getIdx(acc)[Dim] * elements_}, - stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, - extent_{extent} {} - - ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx first, Idx extent) - : elements_{alpaka::getWorkDiv(acc)[Dim]}, - first_{alpaka::getIdx(acc)[Dim] * elements_ + first}, - stride_{alpaka::getWorkDiv(acc)[Dim] * elements_}, - extent_{extent} {} - - class const_iterator; - using iterator = const_iterator; - - ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); } - - ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); } - - class const_iterator { - friend class uniform_elements_along; - - ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first) - : elements_{elements}, - stride_{stride}, - extent_{extent}, - first_{std::min(first, extent)}, - index_{first_}, - range_{std::min(first + elements, extent)} {} - - public: - ALPAKA_FN_ACC inline Idx operator*() const { return index_; } - - // pre-increment the iterator - ALPAKA_FN_ACC inline const_iterator& operator++() { - if constexpr (requires_single_thread_per_block_v) { - // increment the index along the elements processed by the current thread - ++index_; - if (index_ < range_) - return *this; - } - - // increment the thread index with the grid stride - first_ += stride_; - index_ = first_; - range_ = std::min(first_ + elements_, extent_); - if (index_ < extent_) - return *this; - - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = extent_; - index_ = extent_; - range_ = extent_; - return *this; - } - - // post-increment the iterator - ALPAKA_FN_ACC inline const_iterator operator++(int) { - const_iterator old = *this; - ++(*this); - return old; - } - - ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { - return (index_ == other.index_) and (first_ == other.first_); - } - - ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); } - - private: - // non-const to support iterator copy and assignment - Idx elements_; - Idx stride_; - Idx extent_; - // modified by the pre/post-increment operator - Idx first_; - Idx index_; - Idx range_; - }; - - private: - const Idx elements_; - const Idx first_; - const Idx stride_; - const Idx extent_; - }; - /* uniform_elements * * `uniform_elements(acc [, first], extent)` returns a one-dimensional iteratable range that spans the element indices @@ -248,7 +87,7 @@ namespace cms::alpakatools { * If `first` is not specified, it defaults to 0. * If `extent` is not specified, it defaults to the kernel grid size. * - * `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`. + * `uniform_elements(acc, ...)` is a shorthand for `detail::UniformElementsAlong(acc, ...)`. * * To cover the problem space, different threads may execute a different number of iterations. 
As a result, it is not * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. @@ -301,7 +140,21 @@ namespace cms::alpakatools { typename... TArgs, typename = std::enable_if_t and alpaka::Dim::value == 1>> ALPAKA_FN_ACC inline auto uniform_elements(TAcc const& acc, TArgs... args) { - return uniform_elements_along(acc, static_cast(args)...); + return detail::UniformElementsAlong(acc, static_cast(args)...); + } + + /* uniform_elements_along + * + * `uniform_elements_along(acc, ...)` is a shorthand for `detail::UniformElementsAlong(acc, ...)` that can + * infer the accelerator type from the argument. + */ + + template and alpaka::Dim::value >= Dim>> + ALPAKA_FN_ACC inline auto uniform_elements_along(TAcc const& acc, TArgs... args) { + return detail::UniformElementsAlong(acc, static_cast(args)...); } /* uniform_elements_x, _y, _z @@ -313,346 +166,37 @@ namespace cms::alpakatools { typename... TArgs, typename = std::enable_if_t and (alpaka::Dim::value > 0)>> ALPAKA_FN_ACC inline auto uniform_elements_x(TAcc const& acc, TArgs... args) { - return uniform_elements_along::value - 1>(acc, static_cast(args)...); + return detail::UniformElementsAlong::value - 1>(acc, static_cast(args)...); } template and (alpaka::Dim::value > 1)>> ALPAKA_FN_ACC inline auto uniform_elements_y(TAcc const& acc, TArgs... args) { - return uniform_elements_along::value - 2>(acc, static_cast(args)...); + return detail::UniformElementsAlong::value - 2>(acc, static_cast(args)...); } template and (alpaka::Dim::value > 2)>> ALPAKA_FN_ACC inline auto uniform_elements_z(TAcc const& acc, TArgs... args) { - return uniform_elements_along::value - 3>(acc, static_cast(args)...); + return detail::UniformElementsAlong::value - 3>(acc, static_cast(args)...); } /* uniform_elements_nd * - * `uniform_elements_nd(acc, extent)` returns an N-dimensional iteratable range that spans the element indices - * required to cover the given problem size, indicated by `extent`. - * - * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not - * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. - * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner - * loop over each group's elements, and synchronise only in the outer loop: - * - * for (auto group0 : uniform_groups_along<0>(acc, extent[0])) { - * for (auto group1 : uniform_groups_along<1>(acc, extent[1])) { - * for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) { - * for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) { - * // first part of the computation - * // no synchronisations here - * ... - * } - * } - * // wait for all threads to complete the first part - * alpaka::syncBlockThreads(); - * for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) { - * for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) { - * // second part of the computation - * // no synchronisations here - * ... - * } - * } - * // wait for all threads to complete the second part - * alpaka::syncBlockThreads(); - * ... - * } - * } - * - * For more details, see `uniform_elements_along(acc, ...)`. + * `uniform_elements_nd(acc, ...)` is a shorthand for `detail::UniformElementsND(acc, ...)`. 
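+   *
+   * For example, a 2-dimensional kernel could loop over a `height` by `width` problem space with (the extent values
+   * are hypothetical, and `Vec2D` is assumed to be the two-dimensional vector alias from `alpaka_common`):
+   *
+   *   for (auto ndindex : uniform_elements_nd(acc, Vec2D{height, width})) {
+   *     auto y = ndindex[0];  // slower-changing dimension
+   *     auto x = ndindex[1];  // faster-changing dimension
+   *     ...
+   *   }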
*/ template and (alpaka::Dim::value > 0)>> - class uniform_elements_nd { - public: - using Dim = alpaka::Dim; - using Vec = alpaka::Vec; - - ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc) - : elements_{alpaka::getWorkDiv(acc)}, - thread_{alpaka::getIdx(acc) * elements_}, - stride_{alpaka::getWorkDiv(acc) * elements_}, - extent_{stride_} {} - - ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc, Vec extent) - : elements_{alpaka::getWorkDiv(acc)}, - thread_{alpaka::getIdx(acc) * elements_}, - stride_{alpaka::getWorkDiv(acc) * elements_}, - extent_{extent} {} - - // tag used to construct an end iterator - struct at_end_t {}; - - class const_iterator; - using iterator = const_iterator; - - ALPAKA_FN_ACC inline const_iterator begin() const { - // check that all dimensions of the current thread index are within the extent - if ((thread_ < extent_).all()) { - // construct an iterator pointing to the first element to be processed by the current thread - return const_iterator{this, thread_}; - } else { - // construct an end iterator, pointing post the end of the extent - return const_iterator{this, at_end_t{}}; - } - } - - ALPAKA_FN_ACC inline const_iterator end() const { - // construct an end iterator, pointing post the end of the extent - return const_iterator{this, at_end_t{}}; - } - - class const_iterator { - friend class uniform_elements_nd; - - public: - ALPAKA_FN_ACC inline Vec operator*() const { return index_; } - - // pre-increment the iterator - ALPAKA_FN_ACC constexpr inline const_iterator operator++() { - increment(); - return *this; - } - - // post-increment the iterator - ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) { - const_iterator old = *this; - increment(); - return old; - } - - ALPAKA_FN_ACC constexpr inline bool operator==(const_iterator const& other) const { - return (index_ == other.index_); - } - - ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const { return not(*this == other); } - - private: - // construct an iterator pointing to the first element to be processed by the current thread - ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, Vec first) - : loop_{loop}, - first_{alpaka::elementwise_min(first, loop->extent_)}, - range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}, - index_{first_} {} - - // construct an end iterator, pointing post the end of the extent - ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, at_end_t const&) - : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {} - - template - ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() { - bool overflow = false; - ++index_[I]; - if (index_[I] >= range_[I]) { - index_[I] = first_[I]; - overflow = true; - } - return overflow; - } - - template - ALPAKA_FN_ACC inline constexpr bool do_elements_loops() { - if constexpr (N == 0) { - // overflow - return true; - } else { - if (not nth_elements_loop()) { - return false; - } else { - return do_elements_loops(); - } - } - } - - template - ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() { - bool overflow = false; - first_[I] += loop_->stride_[I]; - if (first_[I] >= loop_->extent_[I]) { - first_[I] = loop_->thread_[I]; - overflow = true; - } - index_[I] = first_[I]; - range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]); - return overflow; - } - - template - ALPAKA_FN_ACC inline constexpr bool do_strided_loops() { - if constexpr (N == 0) { - // overflow - return true; - } else { - if 
(not nth_strided_loop()) { - return false; - } else { - return do_strided_loops(); - } - } - } - - // increment the iterator - ALPAKA_FN_ACC inline constexpr void increment() { - if constexpr (requires_single_thread_per_block_v) { - // linear N-dimensional loops over the elements associated to the thread; - // do_elements_loops<>() returns true if any of those loops overflows - if (not do_elements_loops()) { - // the elements loops did not overflow, return the next index - return; - } - } - - // strided N-dimensional loop over the threads in the kernel launch grid; - // do_strided_loops<>() returns true if any of those loops overflows - if (not do_strided_loops()) { - // the strided loops did not overflow, return the next index - return; - } - - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = loop_->extent_; - range_ = loop_->extent_; - index_ = loop_->extent_; - } - - // const pointer to the uniform_elements_nd that the iterator refers to - const uniform_elements_nd* loop_; - - // modified by the pre/post-increment operator - Vec first_; // first element processed by this thread - Vec range_; // last element processed by this thread - Vec index_; // current element processed by this thread - }; - - private: - const Vec elements_; - const Vec thread_; - const Vec stride_; - const Vec extent_; - }; - - /* uniform_groups_along - * - * `uniform_groups_along(acc, elements)` returns a one-dimensional iteratable range than spans the group indices - * required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements` - * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid size - * along the `Dim` dimension. - * - * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`. - * - * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by - * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). - * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for - * `uniform_groups_along(acc, ...)`, `` and ``. - * - * `uniform_groups_along` should be called consistently by all the threads in a block. All threads in a block see - * the same loop iterations, while threads in different blocks may see a different number of iterations. - * If the work division has more blocks than the required number of groups, the first blocks will perform one - * iteration of the loop, while the other blocks will exit the loop immediately. - * If the work division has less blocks than the required number of groups, some of the blocks will perform more than - * one iteration, in order to cover then whole problem space. - * - * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller - * than the block size. However, also in this case all threads in the block will execute the same number of iterations - * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop - * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by - * `uniform_group_elements_along(acc, group, elements)`. 
- * - * For example, if the block size is 64 and there are 400 elements - * - * for (auto group: uniform_groups_along(acc, 400) - * - * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover - * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6, - * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to - * the inner loop to not process the non-existing elements after 399. - * - * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other - * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will - * process one group while block 7 will no process any. - * - * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop, - * in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will process the - * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process - * group 3. - * - * See `uniform_elements_along(acc, ...)` for a concrete example using `uniform_groups_along` and - * `uniform_group_elements_along`. - */ - - template and alpaka::Dim::value >= Dim>> - class uniform_groups_along { - public: - ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc) - : first_{alpaka::getIdx(acc)[Dim]}, - stride_{alpaka::getWorkDiv(acc)[Dim]}, - extent_{stride_} {} - - // extent is the total number of elements (not blocks) - ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc, Idx extent) - : first_{alpaka::getIdx(acc)[Dim]}, - stride_{alpaka::getWorkDiv(acc)[Dim]}, - extent_{divide_up_by(extent, alpaka::getWorkDiv(acc)[Dim])} {} - - class const_iterator; - using iterator = const_iterator; - - ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); } - - ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); } - - class const_iterator { - friend class uniform_groups_along; - - ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first) - : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {} - - public: - ALPAKA_FN_ACC inline Idx operator*() const { return first_; } - - // pre-increment the iterator - ALPAKA_FN_ACC inline const_iterator& operator++() { - // increment the first-element-in-block index by the grid stride - first_ += stride_; - if (first_ < extent_) - return *this; - - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = extent_; - return *this; - } - - // post-increment the iterator - ALPAKA_FN_ACC inline const_iterator operator++(int) { - const_iterator old = *this; - ++(*this); - return old; - } - - ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); } - - ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); } - - private: - // non-const to support iterator copy and assignment - Idx stride_; - Idx extent_; - // modified by the pre/post-increment operator - Idx first_; - }; + ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc) { + return detail::UniformElementsND(acc); + } - private: - const Idx first_; - const Idx stride_; - const Idx extent_; - }; + template and 
 
   /* uniform_groups
    *
@@ -660,7 +204,7 @@ namespace cms::alpakatools {
    * cover the given problem size, in units of the block size. `elements` indicates the total number of elements, across
    * all groups; if not specified, it defaults to the kernel grid size.
    *
-   * `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`.
+   * `uniform_groups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
    *
    * `uniform_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block see
    * the same loop iterations, while threads in different blocks may see a different number of iterations.
@@ -672,7 +216,7 @@ namespace cms::alpakatools {
    * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
    * than the block size. However, also in this case all threads in the block will execute the same number of iterations
    * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop
-   * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by 
+   * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by
    * `uniform_group_elements(acc, group, elements)`.
    *
    * For example, if the block size is 64 and there are 400 elements
@@ -705,7 +249,21 @@ namespace cms::alpakatools {
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto uniform_groups(TAcc const& acc, TArgs... args) {
-    return uniform_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+  }
+
+  /* uniform_groups_along
+   *
+   * `uniform_groups_along<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can infer
+   * the accelerator type from the argument.
+   */
+
+  template <std::size_t Dim,
+            typename TAcc,
+            typename... TArgs,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  ALPAKA_FN_ACC inline auto uniform_groups_along(TAcc const& acc, TArgs... args) {
+    return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }
 
   /* uniform_groups_x, _y, _z
@@ -717,151 +275,30 @@ namespace cms::alpakatools {
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto uniform_groups_x(TAcc const& acc, TArgs... args) {
-    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }
 
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto uniform_groups_y(TAcc const& acc, TArgs... args) {
-    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }
 
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto uniform_groups_z(TAcc const& acc, TArgs... args) {
-    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }
 
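To make the group/element split concrete, a minimal sketch of the pattern these helpers support (the kernel name, buffers and the shared accumulator are hypothetical, not part of this patch; `uniform_group_elements` is documented further below). Because every thread of a block runs the same number of iterations of the outer loop, the block-level synchronisations inside it are safe:

    struct BlockSumKernel {
      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
      ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* data, float* sums, int32_t size) const {
        // one shared accumulator per block
        auto& sum = alpaka::declareSharedVar<float, __COUNTER__>(acc);
        for (auto group : cms::alpakatools::uniform_groups(acc, size)) {
          if (cms::alpakatools::once_per_block(acc)) {
            sum = 0.f;
          }
          alpaka::syncBlockThreads(acc);
          // the inner loop stops before `size`, so the last group does not read past the end
          for (auto element : cms::alpakatools::uniform_group_elements(acc, group, size)) {
            alpaka::atomicAdd(acc, &sum, data[element.global], alpaka::hierarchy::Threads{});
          }
          alpaka::syncBlockThreads(acc);
          if (cms::alpakatools::once_per_block(acc)) {
            sums[group] = sum;
          }
        }
      }
    };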
-  /* uniform_group_elements_along
-   *
-   * `uniform_group_elements_along<Dim>(acc, group, elements)` returns a one-dimensional iterable range that spans all
-   * the elements within the given `group` along dimension `Dim`, as obtained from `uniform_groups_along<Dim>`, up to
-   * `elements` (exclusive). `elements` indicates the total number of elements across all groups; if not specified, it
-   * defaults to the kernel grid size.
-   *
-   * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for
-   * `uniform_group_elements_along<0>(acc, ...)`.
-   *
-   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
-   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
-   * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are
-   * shorthands for `uniform_group_elements_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
-   *
-   * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
-   * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
-   * local index spans the range from 0 to the block size (excluded).
-   *
-   * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
-   * global element index reaches `elements`.
-   *
-   * If the problem size is not a multiple of the block size, different threads may execute a different number of
-   * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
-   * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
-   * See `uniform_elements_along<Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
-   * `uniform_group_elements_along<Dim>`.
-   *
-   * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
-   * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
-   * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
-   * index explicitly inside the loop:
-   *
-   *   for (auto element : uniform_group_elements_along<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
-   *     bool flag = false;
-   *     if (element.global < elements) {
-   *       // do some work and compute a result flag only for the valid elements
-   *       flag = do_some_work();
-   *     }
-   *     // check if any valid element had a positive result
-   *     if (alpaka::warp::any(acc, flag)) {
-   *       // ...
-   *     }
-   *   }
-   *
-   * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
-   */
-
-  template <typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-  class uniform_group_elements_along {
-  public:
-    ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block)
-        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
-          local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
-                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
-          range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}
-
-    ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block, Idx extent)
-        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
-          local_{std::min(extent - first_,
-                          alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
-                              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
-          range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}
-
-    class const_iterator;
-    using iterator = const_iterator;
-
-    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }
-
-    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }
-
-    class const_iterator {
-      friend class uniform_group_elements_along;
-
-      ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
-          : index_{local}, first_{first}, range_{range} {}
-
-    public:
-      ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }
-
-      // pre-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator& operator++() {
-        if constexpr (requires_single_thread_per_block_v<TAcc>) {
-          // increment the index along the elements processed by the current thread
-          ++index_;
-          if (index_ < range_)
-            return *this;
-        }
-
-        // the iterator has reached or passed the end of the extent, clamp it to the extent
-        index_ = range_;
-        return *this;
-      }
-
-      // post-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator operator++(int) {
-        const_iterator old = *this;
-        ++(*this);
-        return old;
-      }
-
-      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }
-
-      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
-
-    private:
-      // modified by the pre/post-increment operator
-      Idx index_;
-      // non-const to support iterator copy and assignment
-      Idx first_;
-      Idx range_;
-    };
-
-  private:
-    const Idx first_;
-    const Idx local_;
-    const Idx range_;
-  };
-
   /* uniform_group_elements
    *
    * `uniform_group_elements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
    * elements within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements`
    * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
    *
-   * `uniform_group_elements(acc, ...)` is a shorthand for `uniform_group_elements_along<0>(acc, ...)`.
+   * `uniform_group_elements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<0>(acc, ...)`.
    *
    * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
    * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
@@ -894,7 +331,7 @@ namespace cms::alpakatools {
    *
    * Note that `uniform_group_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
   * kernels, use
-   *  - `uniform_group_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
+   *  - `detail::UniformGroupElementsAlong<TAcc, Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
    *  - `uniform_group_elements_x(acc, ...)`, `uniform_group_elements_y(acc, ...)`, or
    *    `uniform_group_elements_z(acc, ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
    */
@@ -903,7 +340,21 @@ namespace cms::alpakatools {
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto uniform_group_elements(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+  }
+
+  /* uniform_group_elements_along
+   *
+   * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc, Dim>(acc, ...)`
+   * that can infer the accelerator type from the argument.
+   */
+
+  template <std::size_t Dim,
+            typename TAcc,
+            typename... TArgs,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  ALPAKA_FN_ACC inline auto uniform_group_elements_along(TAcc const& acc, TArgs... args) {
+    return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }
 
   /* uniform_group_elements_x, _y, _z
@@ -916,132 +367,29 @@ namespace cms::alpakatools {
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_x(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }
 
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_y(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }
 
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_z(TAcc const& acc, TArgs... args) {
-    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }
 
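To make the `ElementIndex` contract concrete, a sketch of a common staging pattern (the kernel is hypothetical, and the fixed block size of 256 elements is an assumption that must match the work division used to launch it): `element.local` indexes the per-block shared cache, while `element.global` indexes the global buffers. Splitting the element loop in two, with a synchronisation in between, follows the guidance above:

    struct CacheAndScaleKernel {
      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
      ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, int32_t size, float scale) const {
        // shared cache, sized for an assumed block size of 256 elements
        auto& cache = alpaka::declareSharedVar<float[256], __COUNTER__>(acc);
        for (auto group : cms::alpakatools::uniform_groups(acc, size)) {
          // stage the elements of this group into shared memory
          for (auto element : cms::alpakatools::uniform_group_elements(acc, group, size)) {
            cache[element.local] = in[element.global];
          }
          // safe: all threads in the block execute the same number of outer iterations
          alpaka::syncBlockThreads(acc);
          for (auto element : cms::alpakatools::uniform_group_elements(acc, group, size)) {
            out[element.global] = cache[element.local] * scale;
          }
          alpaka::syncBlockThreads(acc);
        }
      }
    };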
-  /* independent_groups_along
-   *
-   * `independent_groups_along<Dim>(acc, groups)` returns a one-dimensional iterable range that spans the group
-   * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
-   * specified, it defaults to the number of blocks along the `Dim` dimension.
-   *
-   * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
-   * `independent_groups_along<0>(acc, ...)`.
-   *
-   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
-   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
-   * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands
-   * for `independent_groups_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
-   *
-   * `independent_groups_along<Dim>` should be called consistently by all the threads in a block. All threads in a block
-   * see the same loop iterations, while threads in different blocks may see a different number of iterations.
-   * If the work division has more blocks than the required number of groups, the first blocks will perform one
-   * iteration of the loop, while the other blocks will exit the loop immediately.
-   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more than
-   * one iteration, in order to cover the whole problem space.
-   *
-   * For example,
-   *
-   *   for (auto group: independent_groups_along<Dim>(acc, 7))
-   *
-   * will return the group range from 0 to 6, distributed across all blocks in the work division.
-   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
-   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
-   * process one group while block 7 will not process any.
-   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the loop,
-   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
-   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
-   * group 3.
-   */
-
-  template <typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-  class independent_groups_along {
-  public:
-    ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc)
-        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
-          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
-          extent_{stride_} {}
-
-    ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc, Idx groups)
-        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
-          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
-          extent_{groups} {}
-
-    class const_iterator;
-    using iterator = const_iterator;
-
-    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }
-
-    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }
-
-    class const_iterator {
-      friend class independent_groups_along;
-
-      ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
-          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
-
-    public:
-      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
-
-      // pre-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator& operator++() {
-        // increment the first-element-in-block index by the grid stride
-        first_ += stride_;
-        if (first_ < extent_)
-          return *this;
-
-        // the iterator has reached or passed the end of the extent, clamp it to the extent
-        first_ = extent_;
-        return *this;
-      }
-
-      // post-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator operator++(int) {
-        const_iterator old = *this;
-        ++(*this);
-        return old;
-      }
-
-      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }
-
-      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
-
-    private:
-      // non-const to support iterator copy and assignment
-      Idx stride_;
-      Idx extent_;
-      // modified by the pre/post-increment operator
-      Idx first_;
-    };
-
-  private:
-    const Idx first_;
-    const Idx stride_;
-    const Idx extent_;
-  };
-
   /* independent_groups
    *
    * `independent_groups(acc, groups)` returns a one-dimensional iterable range that spans the group indices from 0 to
    * `groups`. If `groups` is not specified, it defaults to the number of blocks.
    *
-   * `independent_groups(acc, ...)` is a shorthand for `independent_groups_along<0>(acc, ...)`.
+   * `independent_groups(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
    *
    * `independent_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
   * see the same loop iterations, while threads in different blocks may see a different number of iterations.
@@ -1074,7 +422,21 @@ namespace cms::alpakatools {
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto independent_groups(TAcc const& acc, TArgs... args) {
-    return independent_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+  }
+
+  /* independent_groups_along
+   *
+   * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that can
+   * infer the accelerator type from the argument.
+   */
+
+  template <std::size_t Dim,
+            typename TAcc,
+            typename... TArgs,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  ALPAKA_FN_ACC inline auto independent_groups_along(TAcc const& acc, TArgs... args) {
+    return detail::IndependentGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }
 
   /* independent_groups_x, _y, _z
@@ -1087,132 +449,45 @@ namespace cms::alpakatools {
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto independent_groups_x(TAcc const& acc, TArgs... args) {
-    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }
 
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto independent_groups_y(TAcc const& acc, TArgs... args) {
-    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }
 
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto independent_groups_z(TAcc const& acc, TArgs... args) {
-    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }
 
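For the independent-group pattern, a minimal sketch (all names are hypothetical): each group processes one of `nTasks` unrelated workloads, each with its own size, rather than a slice of one common extent:

    struct ScaleTasksKernel {
      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
      ALPAKA_FN_ACC void operator()(
          TAcc const& acc, float const* const* input, int32_t const* sizes, float* const* output, int32_t nTasks) const {
        // the groups do not share a common problem size: each one reads its own extent
        for (auto task : cms::alpakatools::independent_groups(acc, nTasks)) {
          for (auto i : cms::alpakatools::independent_group_elements(acc, sizes[task])) {
            output[task][i] = input[task][i] * 2.f;
          }
        }
      }
    };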
-  /* independent_group_elements_along
+  /* independent_group_elements
    */
 
   template <typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-  class independent_group_elements_along {
-  public:
-    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc)
-        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
-          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          extent_{stride_} {}
-
-    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx extent)
-        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
-          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          extent_{extent} {}
-
-    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx first, Idx extent)
-        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
-          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
-          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
-          extent_{extent} {}
-
-    class const_iterator;
-    using iterator = const_iterator;
-
-    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }
-
-    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }
-
-    class const_iterator {
-      friend class independent_group_elements_along;
-
-      ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
-          : elements_{elements},
-            stride_{stride},
-            extent_{extent},
-            first_{std::min(first, extent)},
-            index_{first_},
-            range_{std::min(first + elements, extent)} {}
-
-    public:
-      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
-
-      // pre-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator& operator++() {
-        if constexpr (requires_single_thread_per_block_v<TAcc>) {
-          // increment the index along the elements processed by the current thread
-          ++index_;
-          if (index_ < range_)
-            return *this;
-        }
-
-        // increment the thread index with the block stride
-        first_ += stride_;
-        index_ = first_;
-        range_ = std::min(first_ + elements_, extent_);
-        if (index_ < extent_)
-          return *this;
-
-        // the iterator has reached or passed the end of the extent, clamp it to the extent
-        first_ = extent_;
-        index_ = extent_;
-        range_ = extent_;
-        return *this;
-      }
-
-      // post-increment the iterator
-      ALPAKA_FN_ACC inline const_iterator operator++(int) {
-        const_iterator old = *this;
-        ++(*this);
-        return old;
-      }
-
-      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
-        return (index_ == other.index_) and (first_ == other.first_);
-      }
-
-      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
-
-    private:
-      // non-const to support iterator copy and assignment
-      Idx elements_;
-      Idx stride_;
-      Idx extent_;
-      // modified by the pre/post-increment operator
-      Idx first_;
-      Idx index_;
-      Idx range_;
-    };
-
-  private:
-    const Idx elements_;
-    const Idx thread_;
-    const Idx stride_;
-    const Idx extent_;
-  };
+            typename... TArgs,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) {
+    return detail::IndependentGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+  }
 
-  /* independent_group_elements
+  /* independent_group_elements_along
+   *
+   * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
+   * `detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
    */
 
-  template <typename TAcc,
+  template <std::size_t Dim,
+            typename TAcc,
             typename... TArgs,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-  ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) {
-    return independent_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+  ALPAKA_FN_ACC inline auto independent_group_elements_along(TAcc const& acc, TArgs... args) {
+    return detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }
 
   /* independent_group_elements_x, _y, _z
@@ -1225,21 +500,21 @@ namespace cms::alpakatools {
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_x(TAcc const& acc, TArgs... args) {
-    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }
 
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_y(TAcc const& acc, TArgs... args) {
-    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }
 
   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_z(TAcc const& acc, TArgs... args) {
-    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }
 
   /* once_per_grid