From e25884b43f605969fc985be2e81614ec741d237d Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 19 Dec 2023 21:37:18 +0100 Subject: [PATCH 1/5] Add uniform_groups and uniform_group_elements type aliases --- .../AlpakaInterface/interface/workdivision.h | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 39f19fe463745..7e181363b1290 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -496,6 +496,46 @@ namespace cms::alpakatools { const Idx range_; }; + /* uniform_groups + * + * `uniform_groups(acc, elements)` returns a range that spans the group indices required to cover the given problem + * size, in units of the block size: + * - the `elements` argument indicates the total number of elements, across all groups. + * + * `uniform_groups` should be called consistently by all the threads in a block. All threads in a block see the same + * loop iterations, while threads in different blocks may see a different number of iterations. + * + * For example, if `elements` is 1000 and the block size is 16, + * + * for (auto group: uniform_groups(acc, 1000)) + * + * will return the range from 0 to 62, split across all blocks in the work division. + * + * If the work division has more than 63 blocks, the first 63 will perform one iteration of the loop, while the other + * blocks will exit immediately. + * If the work division has fewer than 63 blocks, some of the blocks will perform more than one iteration, in order to + * cover the whole problem space. 
+ */ + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> + using uniform_groups = blocks_with_stride<TAcc>; + + /* uniform_group_elements + * + * `uniform_group_elements(acc, group, elements)` returns a range that spans all the elements within the given group: + * - the `group` argument indicates the id of the current group, for example as obtained from `uniform_groups`; + * - the `elements` argument indicates the total number of elements, across all groups. + * + * Iterating over the range yields values of type `ElementIndex`, that contain the `.global` and `.local` indices of + * the corresponding element. + * + * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier when the + * element index reaches `elements`. + */ + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> + using uniform_group_elements = elements_in_block<TAcc>; + /* once_per_grid * * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid. From 6844aadf7892fe598a43fa8a31c99e1c5605edcd Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 21 Dec 2023 00:55:38 +0100 Subject: [PATCH 2/5] Add independent_groups and independent_group_elements helper classes - `independent_groups(acc, groups)` returns a range that spans the group indices from 0 to `groups`, with one group per block; - `independent_group_elements(acc, elements)` returns a range that spans all the elements within the given group, from 0 to `elements`. 
--- .../AlpakaInterface/interface/workdivision.h | 173 ++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 7e181363b1290..220abe46b1925 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -536,6 +536,179 @@ namespace cms::alpakatools { template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> using uniform_group_elements = elements_in_block<TAcc>; + /* independent_groups + * + * `independent_groups(acc, groups)` returns a range that spans the group indices from 0 to `groups`, with one group + * per block: + * - the `groups` argument indicates the total number of groups. + * + * If the work division has more blocks than `groups`, only the first `groups` blocks will perform one iteration of + * the loop, while the other blocks will exit immediately. + * If the work division has fewer blocks than `groups`, some of the blocks will perform more than one iteration, in + * order to cover the whole problem space. 
+ */ + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> + class independent_groups { + public: + ALPAKA_FN_ACC inline independent_groups(TAcc const& acc) + : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]}, + stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]}, + extent_{stride_} {} + + // extent is the total number of groups to iterate over (not the number of blocks) + ALPAKA_FN_ACC inline independent_groups(TAcc const& acc, Idx groups) + : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]}, + stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]}, + extent_{groups} {} + + class iterator { + friend class independent_groups; + + ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first) + : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {} + + public: + ALPAKA_FN_ACC inline Idx operator*() const { return first_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline iterator& operator++() { + // increment the group index by the grid stride + first_ += stride_; + if (first_ < extent_) + return *this; + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + return *this; + } + + // post-increment the iterator + ALPAKA_FN_ACC inline iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (first_ == other.first_); } + + ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } + + private: + // non-const to support iterator copy and assignment + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + }; + + ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); } + + ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); } + + private: + const Idx first_; + const Idx stride_; + const Idx extent_; + }; + + /* independent_group_elements + * + * `independent_group_elements(acc, elements)` returns a range that spans all the elements 
within the given group: + * - the `elements` argument indicates the number of elements in the current group. + * + * Iterating over the range yields the local element index, between `0` and `elements - 1`. The threads in the block + * will perform one or more iterations, depending on the number of elements per thread, and on the number of threads + * per block, compared with the total number of elements. + */ + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>> + class independent_group_elements { + public: + ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc) + : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]}, + thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_}, + stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_}, + extent_{stride_} {} + + ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx extent) + : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]}, + thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_}, + stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_}, + extent_{extent} {} + + class iterator { + friend class independent_group_elements; + + ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first) + : elements_{elements}, + stride_{stride}, + extent_{extent}, + first_{std::min(first, extent)}, + index_{first_}, + range_{std::min(first + elements, extent)} {} + + public: + ALPAKA_FN_ACC inline Idx operator*() const { return index_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline iterator& operator++() { + if constexpr (requires_single_thread_per_block_v<TAcc>) { + // increment the index along the elements processed by the current thread + ++index_; + if (index_ < range_) + return *this; + } + + // increment the thread index with the block stride + first_ += stride_; + index_ = first_; + range_ = std::min(first_ + elements_, extent_); + if (index_ < extent_) + return *this; + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + index_ = extent_; + range_ = extent_; + return *this; + } + + // 
post-increment the iterator + ALPAKA_FN_ACC inline iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { + return (index_ == other.index_) and (first_ == other.first_); + } + + ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } + + private: + // non-const to support iterator copy and assignment + Idx elements_; + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + Idx index_; + Idx range_; + }; + + ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); } + + ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } + + private: + const Idx elements_; + const Idx thread_; + const Idx stride_; + const Idx extent_; + }; + /* once_per_grid * * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid. 
From 3dfb1a9f97c149aea43cd6206626ad188a0b1e06 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 21 Dec 2023 00:58:05 +0100 Subject: [PATCH 3/5] Add a test for independent_groups and independent_group_elements --- .../AlpakaInterface/test/BuildFile.xml | 7 + .../test/alpaka/testIndependentKernel.dev.cc | 144 ++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 5f9c5fe81981f..2d204819d740b 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -12,6 +12,13 @@ + + + + + + + diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc new file mode 100644 index 0000000000000..bd98efcfa32d6 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc @@ -0,0 +1,144 @@ +#include +#include + +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +// each test binary is built for a single Alpaka backend +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +/* Add the group id to te value of each element in the group. + * Each group is composed by the elements first[group]..first[group+1]-1 . 
+ */ +struct IndependentWorkKernel { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + T const* __restrict__ in, + T* __restrict__ out, + size_t const* __restrict__ indices, + size_t groups) const { + for (auto group : cms::alpakatools::independent_groups(acc, groups)) { + size_t first = indices[group]; + size_t last = indices[group + 1]; + size_t size = last - first; + for (auto index : cms::alpakatools::independent_group_elements(acc, size)) { + out[first + index] = in[first + index] + group; + } + } + } +}; + +/* Test the IndependentWorkKernel kernel on all devices + */ +template +void testIndependentWorkKernel(size_t groups, size_t grid_size, size_t block_size, TKernel kernel) { + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine engine{rd()}; + + // uniform distribution + std::uniform_int_distribution random_size{100, 201}; + + // gaussian distribution + std::normal_distribution dist{0., 1.}; + + // build the groups + std::vector sizes(groups); + auto indices_h = cms::alpakatools::make_host_buffer(groups + 1); + indices_h[0] = 0; + for (size_t i = 0; i < groups; ++i) { + auto size = random_size(engine); + sizes[i] = size; + indices_h[i + 1] = indices_h[i] + size; + } + + // tolerance + constexpr float epsilon = 0.000001; + + // buffer size + const size_t size = indices_h[groups]; + + // allocate the input and output host buffer in pinned memory accessible by the Platform devices + auto in_h = cms::alpakatools::make_host_buffer(size); + auto out_h = cms::alpakatools::make_host_buffer(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in_h[i] = dist(engine); + out_h[i] = 0; + } + + // run the test on each device + for (auto const& device : cms::alpakatools::devices()) { + std::cout << "Test IndependentWorkKernel on " << alpaka::getName(device) << " over " << size << " elements in " + << groups << " independent 
groups with " << grid_size << " blocks of " << block_size << " elements\n"; + auto queue = Queue(device); + + // allocate input and output buffers on the device + auto indices_d = cms::alpakatools::make_device_buffer(queue, groups + 1); + auto in_d = cms::alpakatools::make_device_buffer(queue, size); + auto out_d = cms::alpakatools::make_device_buffer(queue, size); + + // copy the input data to the device; the size is known from the buffer objects + alpaka::memcpy(queue, indices_d, indices_h); + alpaka::memcpy(queue, in_d, in_h); + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::memset(queue, out_d, 0.); + + // launch the 1-dimensional kernel with independent work groups + auto div = cms::alpakatools::make_workdiv(grid_size, block_size); + alpaka::exec(queue, div, kernel, in_d.data(), out_d.data(), indices_d.data(), groups); + + // copy the results from the device to the host + alpaka::memcpy(queue, out_h, out_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + // check the results + for (size_t g = 0; g < groups; ++g) { + size_t first = indices_h[g]; + size_t last = indices_h[g + 1]; + for (size_t i = first; i < last; ++i) { + float sum = in_h[i] + g; + float delta = std::max(std::fabs(sum) * epsilon, epsilon); + REQUIRE(out_h[i] < sum + delta); + REQUIRE(out_h[i] > sum - delta); + } + } + } +} + +TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend", + "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") { + SECTION("Independent work groups") { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + INFO("No devices available on the platform " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)); + REQUIRE(not devices.empty()); + } + + // launch the independent work kernel with a small block size and a small number of blocks; + // this relies on the kernel to loop over the "problem 
space" and do more work per block + std::cout << "Test independent work kernel with small block size, using scalar dimensions\n"; + testIndependentWorkKernel(100, 32, 32, IndependentWorkKernel{}); + + // launch the independent work kernel with a large block size and a single block; + // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data + std::cout << "Test independent work kernel with large block size, using scalar dimensions\n"; + testIndependentWorkKernel(100, 1, 1024, IndependentWorkKernel{}); + + // launch the independent work kernel with a large block size and a large number of blocks; + // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data + std::cout << "Test independent work kernel with large block size, using scalar dimensions\n"; + testIndependentWorkKernel(100, 1024, 1024, IndependentWorkKernel{}); + } +} From ce64e0a4583ff6d9660861d2cd83ea1355d6fd4c Mon Sep 17 00:00:00 2001 From: Eric Cano Date: Thu, 21 Dec 2023 11:44:38 +0100 Subject: [PATCH 4/5] Add support for shifted elements_with_stride loop in range for --- .../AlpakaInterface/interface/workdivision.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 220abe46b1925..0433980e7c6d6 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -86,6 +86,11 @@ namespace cms::alpakatools { }; /* elements_with_stride + * + * `elements_with_stride(acc, [first, ]extent)` returns an iteratable range that spans the element indices required to + * cover the given problem size: + * - `first` (optional) is index to the first element; if not specified, the loop starts from 0; + * - `extent` is the total size of the problem, including any elements that may come before 
`first`. */ template and alpaka::Dim::value == 1>> @@ -93,13 +98,19 @@ namespace cms::alpakatools { public: ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc) : elements_{alpaka::getWorkDiv(acc)[0u]}, - thread_{alpaka::getIdx(acc)[0u] * elements_}, + first_{alpaka::getIdx(acc)[0u] * elements_}, stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{stride_} {} ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent) : elements_{alpaka::getWorkDiv(acc)[0u]}, - thread_{alpaka::getIdx(acc)[0u] * elements_}, + first_{alpaka::getIdx(acc)[0u] * elements_}, + stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + extent_{extent} {} + + ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx first, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[0u]}, + first_{alpaka::getIdx(acc)[0u] * elements_ + first}, stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{extent} {} @@ -164,13 +175,13 @@ namespace cms::alpakatools { Idx range_; }; - ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); } + ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } private: const Idx elements_; - const Idx thread_; + const Idx first_; const Idx stride_; const Idx extent_; }; From d5e466bc9428bdb2c33f8df1b6ddfb52dc220788 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 21 Dec 2023 12:08:33 +0100 Subject: [PATCH 5/5] Add tests for the shifted elements_with_stride --- .../test/alpaka/testKernel.dev.cc | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc index 300f139b0c6e3..a730e4b515a76 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc +++ 
b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc @@ -23,6 +23,20 @@ struct VectorAddKernel { } }; +struct VectorAddKernelSkip { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + T const* __restrict__ in1, + T const* __restrict__ in2, + T* __restrict__ out, + size_t first, + size_t size) const { + for (auto index : cms::alpakatools::elements_with_stride(acc, first, size)) { + out[index] = in1[index] + in2[index]; + } + } +}; + struct VectorAddKernel1D { template ALPAKA_FN_ACC void operator()( @@ -224,6 +238,76 @@ void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::s } } +// test the 1-dimensional kernel on all devices, potentially skipping some elements +template +void testVectorAddKernelSkip(std::size_t skip_elements, + std::size_t problem_size, + std::size_t grid_size, + std::size_t block_size, + TKernel kernel) { + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // buffer size + const size_t size = problem_size; + + // allocate input and output host buffers in pinned memory accessible by the Platform devices + auto in1_h = cms::alpakatools::make_host_buffer(size); + auto in2_h = cms::alpakatools::make_host_buffer(size); + auto out_h = cms::alpakatools::make_host_buffer(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + // run the test on each device + for (auto const& device : cms::alpakatools::devices()) { + std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " skipping " << skip_elements << " over " + << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n"; + auto queue = Queue(device); + + // allocate input and output buffers 
on the device + auto in1_d = cms::alpakatools::make_device_buffer(queue, size); + auto in2_d = cms::alpakatools::make_device_buffer(queue, size); + auto out_d = cms::alpakatools::make_device_buffer(queue, size); + + // copy the input data to the device; the size is known from the buffer objects + alpaka::memcpy(queue, in1_d, in1_h); + alpaka::memcpy(queue, in2_d, in2_h); + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::memset(queue, out_d, 0.); + + // launch the 1-dimensional kernel with scalar size + auto div = cms::alpakatools::make_workdiv(grid_size, block_size); + alpaka::exec(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), skip_elements, size); + + // copy the results from the device to the host + alpaka::memcpy(queue, out_h, out_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + // check the results + for (size_t i = 0; i < skip_elements; ++i) { + REQUIRE(out_h[i] == 0); + } + for (size_t i = skip_elements; i < size; ++i) { + float sum = in1_h[i] + in2_h[i]; + REQUIRE(out_h[i] < sum + epsilon); + REQUIRE(out_h[i] > sum - epsilon); + } + } +} + // test the N-dimensional kernels on all devices template void testVectorAddKernelND(Vec problem_size, Vec grid_size, Vec block_size, TKernel kernel) { @@ -367,5 +451,15 @@ TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESP // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n"; testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{}); + + // launch the 1-dimensional kernel with a small block size and a small number of blocks; + // this relies on the kernel to loop over the "problem space" and do more work per block + std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n"; + testVectorAddKernelSkip(20, 10000, 
32, 32, VectorAddKernelSkip{}); + + // launch the 1-dimensional kernel with a large block size and a single block; + // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data + std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n"; + testVectorAddKernelSkip(20, 100, 1, 1024, VectorAddKernelSkip{}); } }