From 3eed8adec9293f3c061271b3b6c7498029847c9d Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 6 Nov 2023 01:44:10 +0100
Subject: [PATCH 1/5] Simplify and extend the alpaka kernel tests

---
 .../test/alpaka/testKernel.dev.cc             | 348 ++++++++----------
 1 file changed, 147 insertions(+), 201 deletions(-)
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
index c35965fa8793b..5866137f547f0 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -13,8 +13,6 @@
 // each test binary is built for a single Alpaka backend
 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
 
-static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel) "]";
-
 struct VectorAddKernel {
   template <typename TAcc, typename T>
   ALPAKA_FN_ACC void operator()(
@@ -58,233 +56,181 @@ struct VectorAddKernel3D {
   }
 };
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) {
-  SECTION("VectorAddKernel") {
-    // get the list of devices on the current platform
-    auto const& devices = cms::alpakatools::devices<Platform>();
-    if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
-    }
-
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
-
-    // tolerance
-    constexpr float epsilon = 0.000001;
-
-    // buffer size
-    constexpr size_t size = 1024 * 1024;
-
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-
-    // fill the input buffers with random data, and the output buffer with zeros
-    for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
-    }
-
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 1D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
-
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
-
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
-
-      // launch the 1-dimensional kernel with scalar size
-      auto div = cms::alpakatools::make_workdiv<Acc1D>(4, 4);
-      alpaka::exec<Acc1D>(queue, div, VectorAddKernel{}, in1_d.data(), in2_d.data(), out_d.data(), size);
+// test the 1-dimensional kernel on all devices
+template <typename TKernel>
+void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
+  // random number generator with a gaussian distribution
+  std::random_device rd{};
+  std::default_random_engine rand{rd()};
+  std::normal_distribution<float> dist{0., 1.};
+
+  // tolerance
+  constexpr float epsilon = 0.000001;
+
+  // buffer size
+  const size_t size = problem_size;
+
+  // allocate input and output host buffers in pinned memory accessible by the Platform devices
+  auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+  auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+  auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+
+  // fill the input buffers with random data, and the output buffer with zeros
+  for (size_t i = 0; i < size; ++i) {
+    in1_h[i] = dist(rand);
+    in2_h[i] = dist(rand);
+    out_h[i] = 0.;
+  }
 
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
+  // run the test on each device
+  for (auto const& device : cms::alpakatools::devices<Platform>()) {
+    std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " over " << problem_size << " values with "
+              << grid_size << " blocks of " << block_size << " elements\n";
+    auto queue = Queue(device);
 
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+    // allocate input and output buffers on the device
+    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
-      }
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::memcpy(queue, in1_d, in1_h);
+    alpaka::memcpy(queue, in2_d, in2_h);
 
-      // reset the output buffer on the device to all zeros
-      alpaka::memset(queue, out_d, 0.);
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::memset(queue, out_d, 0.);
 
-      // launch the 1-dimensional kernel with vector size
-      alpaka::exec<Acc1D>(queue, div, VectorAddKernel1D{}, in1_d.data(), in2_d.data(), out_d.data(), size);
+    // launch the 1-dimensional kernel with scalar size
+    auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
+    alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), size);
 
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
+    // copy the results from the device to the host
+    alpaka::memcpy(queue, out_h, out_d);
 
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+    // wait for all the operations to complete
+    alpaka::wait(queue);
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
-      }
+    // check the results
+    for (size_t i = 0; i < size; ++i) {
+      float sum = in1_h[i] + in2_h[i];
+      REQUIRE(out_h[i] < sum + epsilon);
+      REQUIRE(out_h[i] > sum - epsilon);
     }
   }
 }
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel2D), s_tag) {
-  SECTION("VectorAddKernel2D") {
-    // get the list of devices on the current platform
-    auto const& devices = cms::alpakatools::devices<Platform>();
-    if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
-    }
-
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
-
-    // tolerance
-    constexpr float epsilon = 0.000001;
-
-    // 3-dimensional and linearised buffer size
-    constexpr Vec2D ndsize = {16, 16};
-    constexpr size_t size = ndsize.prod();
-
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-
-    // fill the input buffers with random data, and the output buffer with zeros
-    for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
-    }
+// test the N-dimensional kernels on all devices
+template <typename TDim, typename TKernel>
+void testVectorAddKernelND(Vec<TDim> problem_size, Vec<TDim> grid_size, Vec<TDim> block_size, TKernel kernel) {
+  // random number generator with a gaussian distribution
+  std::random_device rd{};
+  std::default_random_engine rand{rd()};
+  std::normal_distribution<float> dist{0., 1.};
+
+  // tolerance
+  constexpr float epsilon = 0.000001;
+
+  // linearised buffer size
+  const size_t size = problem_size.prod();
+
+  // allocate input and output host buffers in pinned memory accessible by the Platform devices
+  auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+  auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+  auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+
+  // fill the input buffers with random data, and the output buffer with zeros
+  for (size_t i = 0; i < size; ++i) {
+    in1_h[i] = dist(rand);
+    in2_h[i] = dist(rand);
+    out_h[i] = 0.;
+  }
 
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 2D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
+  // run the test on each device
+  for (auto const& device : cms::alpakatools::devices<Platform>()) {
+    std::cout << "Test " << TDim::value << "D vector addition on " << alpaka::getName(device) << " over "
+              << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
+    auto queue = Queue(device);
 
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    // allocate input and output buffers on the device
+    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
 
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::memcpy(queue, in1_d, in1_h);
+    alpaka::memcpy(queue, in2_d, in2_h);
 
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::memset(queue, out_d, 0.);
 
-      // launch the 3-dimensional kernel
-      auto div = cms::alpakatools::make_workdiv<Acc2D>({4, 4}, {32, 32});
-      alpaka::exec<Acc2D>(queue, div, VectorAddKernel2D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);
+    // launch the N-dimensional kernel
+    using AccND = Acc<TDim>;
+    auto div = cms::alpakatools::make_workdiv<AccND>(grid_size, block_size);
+    alpaka::exec<AccND>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), problem_size);
 
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
+    // copy the results from the device to the host
+    alpaka::memcpy(queue, out_h, out_d);
 
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+    // wait for all the operations to complete
+    alpaka::wait(queue);
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
-      }
+    // check the results
+    for (size_t i = 0; i < size; ++i) {
+      float sum = in1_h[i] + in2_h[i];
+      REQUIRE(out_h[i] < sum + epsilon);
+      REQUIRE(out_h[i] > sum - epsilon);
     }
   }
 }
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
-  SECTION("VectorAddKernel3D") {
+TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
+          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
+  SECTION("Alpaka N-dimensional kernels") {
     // get the list of devices on the current platform
     auto const& devices = cms::alpakatools::devices<Platform>();
     if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
+      INFO("No devices available on the platform " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE));
+      REQUIRE(not devices.empty());
     }
 
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
-
-    // tolerance
-    constexpr float epsilon = 0.000001;
-
-    // 3-dimensional and linearised buffer size
-    constexpr Vec3D ndsize = {50, 125, 16};
-    constexpr size_t size = ndsize.prod();
-
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
-
-    // fill the input buffers with random data, and the output buffer with zeros
-    for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
-    }
-
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 3D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
-
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
-
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
-
-      // launch the 3-dimensional kernel
-      auto div = cms::alpakatools::make_workdiv<Acc3D>({5, 5, 1}, {4, 4, 4});
-      alpaka::exec<Acc3D>(queue, div, VectorAddKernel3D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);
-
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
-
-      // wait for all the operations to complete
-      alpaka::wait(queue);
-
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
-      }
-    }
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernel{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernel{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector addition with small block size\n";
+    testVectorAddKernelND<Dim1D>({10000}, {32}, {32}, VectorAddKernel1D{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector addition with large block size\n";
+    testVectorAddKernelND<Dim1D>({100}, {1}, {1024}, VectorAddKernel1D{});
+
+    // launch the 2-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 2D vector addition with small block size\n";
+    testVectorAddKernelND<Dim2D>({400, 250}, {4, 4}, {16, 16}, VectorAddKernel2D{});
+
+    // launch the 2-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 2D vector addition with large block size\n";
+    testVectorAddKernelND<Dim2D>({20, 20}, {1, 1}, {32, 32}, VectorAddKernel2D{});
+
+    // launch the 3-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 3D vector addition with small block size\n";
+    testVectorAddKernelND<Dim3D>({50, 125, 16}, {5, 5, 1}, {4, 4, 4}, VectorAddKernel3D{});
+
+    // launch the 3-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 3D vector addition with large block size\n";
+    testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});
   }
 }

From 727268383b09e2a25cf7dad8ea1e01821bbb6ef0 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 6 Nov 2023 18:09:24 +0100
Subject: [PATCH 2/5] Add the blocks_with_stride and elements_in_block ranges

`blocks_with_stride(acc, size)` returns a range that spans the
(virtual) block indices required to cover the given problem size.

For example, if size is 1000 and the block size is 16, it will return
the range from 0 to 62.
If the work division has more than 63 blocks, only the first 63 will
perform one iteration of the loop, and the others will exit immediately.
If the work division has fewer than 63 blocks, some of the blocks will
perform more than one iteration, in order to cover the whole problem
space.

All threads in a block see the same loop iterations, while threads in
different blocks may see a different number of iterations.

`elements_in_block(acc, block, size)` returns a range that spans all
the elements within the given block.
Iterating over the range yields values of type ElementIndex, that
contain both .global and .local indices of the corresponding element.

If the work division has only one element per thread, the loop will
perform at most one iteration. If the work division has more than one
element per thread, the loop will perform that number of iterations,
or fewer if it reaches size.
---
 .../AlpakaInterface/interface/workdivision.h  | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)

diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 7449bb153c9f7..7f0c719148677 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -75,6 +75,19 @@ namespace cms::alpakatools {
     }
   }
 
+  /* ElementIndex
+   *
+   * An aggregate that contains the .global and .local indices of an element; returned by iterating over elements_in_block.
+   */
+
+  struct ElementIndex {
+    Idx global;  // index of the element in the whole problem space (offset of the block + .local)
+    Idx local;   // index of the element within its block
+  };
+
+  /* elements_with_stride
+   */
+
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   class elements_with_stride {
   public:
@@ -326,6 +339,163 @@ namespace cms::alpakatools {
     const Vec extent_;
   };
 
+  /* blocks_with_stride
+   *
+   * `blocks_with_stride(acc, size)` returns a range that spans the (virtual) block indices required to cover the given
+   * problem size.
+   *
+   * For example, if size is 1000 and the block size is 16, it will return the range from 0 to 62.
+   * If the work division has more than 63 blocks, only the first 63 will perform one iteration of the loop, and the
+   * others will exit immediately.
+   * If the work division has fewer than 63 blocks, some of the blocks will perform more than one iteration, in order to
+   * cover the whole problem space.
+   *
+   * All threads in a block see the same loop iterations, while threads in different blocks may see a different number
+   * of iterations.
+   */
+
+  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class blocks_with_stride {
+  public:
+    ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc)
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          extent_{stride_} {}
+
+    // extent is the total number of elements (not blocks); it is converted to a number of blocks, rounding up
+    ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc, Idx extent)
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u])} {}
+
+    class iterator {
+      friend class blocks_with_stride;
+
+      ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first)
+          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // advance the (virtual) block index by the grid stride, i.e. the number of blocks in the grid
+        first_ += stride_;
+        if (first_ < extent_)
+          return *this;
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent so that it compares equal to end()
+        first_ = extent_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (first_ == other.first_); }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Idx stride_;
+      Idx extent_;
+      // current block index; modified by the pre/post-increment operator
+      Idx first_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); }
+
+  private:
+    const Idx first_;
+    const Idx stride_;
+    const Idx extent_;
+  };
+
+  /* elements_in_block
+   *
+   * `elements_in_block(acc, block, size)` returns a range that spans all the elements within the given block.
+   * Iterating over the range yields values of type ElementIndex, that contain both .global and .local indices
+   * of the corresponding element.
+   *
+   * If the work division has only one element per thread, the loop will perform at most one iteration.
+   * If the work division has more than one element per thread, the loop will perform that number of iterations,
+   * or fewer if it reaches size.
+   */
+
+  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class elements_in_block {
+  public:
+    ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block)
+        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
+          local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
+                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]} {}
+
+    ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block, Idx extent)
+        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
+          local_{std::min(extent - first_,
+                          alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
+                              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])},
+          range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])} {}
+
+    class iterator {
+      friend class elements_in_block;
+
+      ALPAKA_FN_ACC inline iterator(Idx local, Idx first, Idx range) : index_{local}, first_{first}, range_{range} {}
+
+    public:
+      ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // increment the index along the elements processed by the current thread
+          ++index_;
+          if (index_ < range_)
+            return *this;
+        }
+
+        // end of the range reached or passed, or backend without the single-thread-per-block requirement: clamp to the end
+        index_ = range_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (index_ == other.index_); }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // index within the block; modified by the pre/post-increment operator
+      Idx index_;
+      // non-const to support iterator copy and assignment
+      Idx first_;
+      Idx range_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(local_, first_, range_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(range_, first_, range_); }
+
+  private:
+    const Idx first_;
+    const Idx local_;
+    const Idx range_;
+  };
+
 }  // namespace cms::alpakatools
 
 #endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h

From 42360abe6858e3c83ae67657df66285edd230bcf Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 6 Nov 2023 18:24:08 +0100
Subject: [PATCH 3/5] Add a test for blocks_with_stride and elements_in_block

---
 .../test/alpaka/testKernel.dev.cc             | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
index 5866137f547f0..00127ba0bc625 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -56,6 +56,60 @@ struct VectorAddKernel3D {
   }
 };
 
+/* This is not an efficient approach; it is only a test of using dynamic shared memory,
+ * split block and element loops, and block-level synchronisation
+ */
+
+struct VectorAddBlockKernel {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // number of elements per block; NOTE(review): not referenced in the rest of this kernel
+    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+    // get the dynamic shared memory buffer
+    T* buffer = alpaka::getDynSharedMem<T>(acc);
+    // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
+    // the inner loop is needed for backends that use more than one element per thread
+    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+      // read the first set of data into shared memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        buffer[index.local] = in1[index.global];
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // add the second set of data into shared memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        buffer[index.local] += in2[index.global];
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // store the results into global memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        out[index.global] = buffer[index.local];
+      }
+    }
+  }
+};
+
+namespace alpaka::trait {
+  // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
+  // block shared dynamic memory for the VectorAddBlockKernel kernel
+  template <typename TAcc>
+  struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, TAcc> {
+    // the size in bytes of the shared memory allocated for a block: room for threads[0] * elements[0] values of type T
+    template <typename T>
+    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& /* kernel */,
+                                                                        Vec1D threads,
+                                                                        Vec1D elements,
+                                                                        T const* __restrict__ /* in1 */,
+                                                                        T const* __restrict__ /* in2 */,
+                                                                        T* __restrict__ /* out */,
+                                                                        size_t /* size */) {
+      return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
+    }
+  };
+}  // namespace alpaka::trait
+
 // test the 1-dimensional kernel on all devices
 template <typename TKernel>
 void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
@@ -232,5 +286,15 @@ TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESP
     // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
     std::cout << "Test 3D vector addition with large block size\n";
     testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
   }
 }

From 6462fcfaf330222e18cc8673394f7fc0318ed470 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 13 Nov 2023 01:03:52 +0100
Subject: [PATCH 4/5] Add once_per_grid and once_per_block helper functions

`once_per_grid(acc)` returns true for a single thread within the kernel
execution grid. Usually the condition is true for block 0 and thread 0,
but these indices should not be relied upon.

`once_per_block(acc)` returns true for a single thread within the block.
Usually the condition is true for thread 0, but this index should not be
relied upon.
---
 .../AlpakaInterface/interface/workdivision.h  | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 7f0c719148677..39f19fe463745 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -496,6 +496,30 @@ namespace cms::alpakatools {
     const Idx range_;
   };
 
+  /* once_per_grid
+   *
+   * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid, and false for all
+   * the others. The current implementation selects the thread whose index within the grid is zero in every
+   * dimension (usually block 0, thread 0), but callers should not rely on which thread is selected.
+   */
+
+  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+  ALPAKA_FN_ACC inline constexpr bool once_per_grid(TAcc const& acc) {
+    return alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
+  }
+
+  /* once_per_block
+   *
+   * `once_per_block(acc)` returns true for a single thread within each block, and false for all the others.
+   * The current implementation selects the thread whose index within the block is zero in every dimension
+   * (usually thread 0), but callers should not rely on which thread is selected.
+   */
+
+  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+  ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) {
+    return alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
+  }
+
 }  // namespace cms::alpakatools
 
 #endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h

From 8c859bc84150f089c1b393abaabbed230f445b34 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 14 Nov 2023 00:37:15 +0100
Subject: [PATCH 5/5] Add a test for once_per_grid and once_per_block

---
 .../test/alpaka/testKernel.dev.cc             | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
index 00127ba0bc625..300f139b0c6e3 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -71,6 +71,14 @@ struct VectorAddBlockKernel {
     // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
     // the inner loop is needed for backends that use more than one element per thread
     for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+      // only one thread per block: initialise the shared memory
+      if (cms::alpakatools::once_per_block(acc)) {
+        // not really necessary, just to show how to use "once_per_block"
+        for (Idx local = 0; local < blockSize; ++local)
+          buffer[local] = 0.;
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
       // read the first set of data into shared memory
       for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
         buffer[index.local] = in1[index.global];
@@ -91,6 +99,49 @@ struct VectorAddBlockKernel {
   }
 };
 
+/* Accumulate `in1` and `in2` into the first `size` elements of `out`, using a single thread of the grid.
+ * Written in an inefficient way to test "once_per_grid"; `out` is updated with `+=`, so it must be initialised
+ * by the caller (presumably to zero - to be confirmed against the test harness). */
+
+struct VectorAddKernelSerial {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // only the thread selected by once_per_grid does any work; every other thread skips the loop
+    if (cms::alpakatools::once_per_grid(acc)) {
+      for (Idx index = 0; index < size; ++index) {
+        out[index] += in1[index];
+        out[index] += in2[index];
+      }
+    }
+  }
+};
+
+/* Accumulate `in1` and `in2` into the first `size` elements of `out`, using one thread per block.
+ * Written in an inefficient way to test "once_per_block"; `out` is updated with `+=`, so it must be initialised
+ * by the caller (presumably to zero - to be confirmed against the test harness). */
+
+struct VectorAddKernelBlockSerial {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // number of elements in a block: first component of the Block/Elems work division
+    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+    // the loop is used to repeat the "block" as many times as needed to cover the whole problem space
+    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+      // [first, range) is the slice of this "logical" block, clamped to size; a single thread per block adds it
+      const auto first = blockSize * block;
+      const auto range = std::min<size_t>(first + blockSize, size);
+      if (cms::alpakatools::once_per_block(acc)) {
+        for (Idx index = first; index < range; ++index) {
+          out[index] += in1[index];
+          out[index] += in2[index];
+        }
+      }
+    }
+  }
+};
+
 namespace alpaka::trait {
   // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
   // block shared dynamic memory for the VectorAddBlockKernel kernel
@@ -296,5 +347,25 @@ TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESP
     // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
     std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
     testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernelSerial{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector single-threaded serial addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernelSerial{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector block-level serial addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernelBlockSerial{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{});
   }
 }