Split implementation to separate header files

Rename classes to CamelCase and move them to the detail namespace: - uniform_elements_along to detail::UniformElementsAlong - uniform_groups_along to detail::UniformGroupsAlong - uniform_group_elements_along to detail::UniformGroupElementsAlong - uniform_elements_nd to detail::UniformElementsND - independent_groups_along to detail::IndependentGroupsAlong - independent_group_elements_along to detail::IndependentGroupElementsAlong Move the implementation to separate header files. Introduce helper functions with the old names.
cms-sw · Apr 11, 2024 · 9e26825 · 9e26825
1 parent f7c445c
commit 9e26825
Show file tree

Hide file tree

Showing 7 changed files with 1,049 additions and 834 deletions.
diff --git a/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupElementsAlong.h b/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupElementsAlong.h
@@ -0,0 +1,122 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupElementsAlong_h
+#define HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupElementsAlong_h
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace cms::alpakatools::detail {
+
+  using namespace alpaka_common;
+
+  /* IndependentGroupElementsAlong
+   *
+   * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
+   * `IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
+   */
+
+  template <typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>  // NOTE(review): the constructors index the work division with [Dim]; confirm whether this guard should be `> Dim`
+  class IndependentGroupElementsAlong {  // iterable range of element indices along dimension Dim, derived from the in-block thread index and the work division
+  public:
+    ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)  // no explicit extent: the range covers exactly one block-wide stride
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},  // elements per thread along Dim
+          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},  // first element index of this thread
+          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},  // elements covered by all threads of the block
+          extent_{stride_} {}  // members are initialised in declaration order, so stride_ is already set here
+
+    ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)  // `extent` is the exclusive upper bound of the indices
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+          extent_{extent} {}
+
+    ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)  // `first` shifts the start; `extent` stays the exclusive upper bound
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
+          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},  // this thread's start, offset by `first`
+          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
+          extent_{extent} {}
+
+    class const_iterator;  // defined below
+    using iterator = const_iterator;  // the range is read-only, so both iterator types are the same
+
+    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }  // starts at this thread's first element
+
+    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }  // sentinel: first_ == index_ == extent_
+
+    class const_iterator {
+      friend class IndependentGroupElementsAlong;  // the constructor below is private; only the range creates iterators
+
+      ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
+          : elements_{elements},
+            stride_{stride},
+            extent_{extent},
+            first_{std::min(first, extent)},  // clamp so that a start past the extent compares equal to end()
+            index_{first_},
+            range_{std::min(first + elements, extent)} {}  // one past the last element of the current chunk, capped at the extent
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }  // current element index
+
+      // pre-increment: move to the next element of the current chunk, or jump ahead by one block stride
+      ALPAKA_FN_ACC inline const_iterator& operator++() {
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {  // otherwise every increment goes straight to the stride jump below
+          // step through the consecutive elements [first_, range_) handled by the current thread
+          ++index_;
+          if (index_ < range_)
+            return *this;
+        }
+
+        // current chunk exhausted: advance the chunk start by the block stride and recompute its end
+        first_ += stride_;
+        index_ = first_;
+        range_ = std::min(first_ + elements_, extent_);
+        if (index_ < extent_)
+          return *this;
+
+        // reached or passed the extent: clamp all three fields so the iterator compares equal to end()
+        first_ = extent_;
+        index_ = extent_;
+        range_ = extent_;
+        return *this;
+      }
+
+      // post-increment: advance the iterator and return a copy of its previous state
+      ALPAKA_FN_ACC inline const_iterator operator++(int) {
+        const_iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {  // equality considers only the current position and chunk start
+        return (index_ == other.index_) and (first_ == other.first_);
+      }
+
+      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
+
+    private:
+      // loop parameters, fixed after construction; non-const to support iterator copy and assignment
+      Idx elements_;
+      Idx stride_;
+      Idx extent_;
+      // iteration state, modified by the pre/post-increment operators
+      Idx first_;  // start of the current chunk
+      Idx index_;  // value returned by operator*
+      Idx range_;  // one past the end of the current chunk
+    };
+
+  private:
+    const Idx elements_;  // elements per thread along Dim
+    const Idx thread_;  // first element index handled by this thread
+    const Idx stride_;  // threads per block along Dim, times elements_
+    const Idx extent_;  // exclusive upper bound of the element indices
+  };
+
+}  // namespace cms::alpakatools::detail
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupElementsAlong_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupsAlong.h b/HeterogeneousCore/AlpakaInterface/interface/detail/IndependentGroupsAlong.h
@@ -0,0 +1,124 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupsAlong_h
+#define HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupsAlong_h
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace cms::alpakatools::detail {
+
+  using namespace alpaka_common;
+
+  /* IndependentGroupsAlong
+   *
+   * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iterable range that spans the group
+   * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
+   * specified, it defaults to the number of blocks along the `Dim` dimension.
+   *
+   * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that can
+   * infer the accelerator type from the argument.
+   *
+   * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
+   * `IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
+   *
+   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
+   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
+   * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands
+   * for `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+   *
+   * `independent_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads
+   * in a block see the same loop iterations, while threads in different blocks may see a different number of iterations.
+   * If the work division has more blocks than the required number of groups, the first blocks will perform one
+   * iteration of the loop, while the other blocks will exit the loop immediately.
+   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more than
+   * one iteration, in order to cover the whole problem space.
+   *
+   * For example,
+   *
+   *   for (auto group: independent_groups_along<Dim>(acc, 7))
+   *
+   * will return the group range from 0 to 6, distributed across all blocks in the work division.
+   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
+   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
+   * process one group while block 7 will not process any.
+   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the loop,
+   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
+   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
+   * group 3.
+   */
+
+  template <typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>  // NOTE(review): the constructors index the work division with [Dim]; confirm whether this guard should be `> Dim`
+  class IndependentGroupsAlong {  // iterable range of group indices along dimension Dim, derived from the block index and the number of blocks
+  public:
+    ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)  // no explicit group count: defaults to the number of blocks along Dim
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},  // index of this block in the grid along Dim
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},  // number of blocks in the grid along Dim
+          extent_{stride_} {}  // members are initialised in declaration order, so stride_ is already set here
+
+    ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)  // `groups` is the exclusive upper bound of the group indices
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
+          extent_{groups} {}
+
+    class const_iterator;  // defined below
+    using iterator = const_iterator;  // the range is read-only, so both iterator types are the same
+
+    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }  // starts at this block's index
+
+    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }  // sentinel: first_ == extent_
+
+    class const_iterator {
+      friend class IndependentGroupsAlong;  // the constructor below is private; only the range creates iterators
+
+      ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
+          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}  // clamp so that a start past the extent compares equal to end()
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }  // current group index
+
+      // pre-increment: advance to the next group handled by this block
+      ALPAKA_FN_ACC inline const_iterator& operator++() {
+        // advance the group index by the number of blocks in the grid along Dim
+        first_ += stride_;
+        if (first_ < extent_)
+          return *this;
+
+        // reached or passed the extent: clamp so the iterator compares equal to end()
+        first_ = extent_;
+        return *this;
+      }
+
+      // post-increment: advance the iterator and return a copy of its previous state
+      ALPAKA_FN_ACC inline const_iterator operator++(int) {
+        const_iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }  // equality considers only the current group index
+
+      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
+
+    private:
+      // loop parameters, fixed after construction; non-const to support iterator copy and assignment
+      Idx stride_;
+      Idx extent_;
+      // iteration state (current group index), modified by the pre/post-increment operators
+      Idx first_;
+    };
+
+  private:
+    const Idx first_;  // index of this block in the grid along Dim
+    const Idx stride_;  // number of blocks in the grid along Dim
+    const Idx extent_;  // exclusive upper bound of the group indices
+  };
+
+}  // namespace cms::alpakatools::detail
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_detail_IndependentGroupsAlong_h