Merge pull request #42560 from cms-patatrack/Alpaka_updates_13.0.x

Fix elements_with_stride_nd when the index is outside the extent [13.1.x]
cms-sw · Aug 15, 2023 · 2d3f917 · 2d3f917
2 parents 098887e + 938674c
commit 2d3f917
Show file tree

Hide file tree

Showing 2 changed files with 133 additions and 27 deletions.
diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -32,6 +32,11 @@ namespace cms::alpakatools {
   struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
 #endif  // ALPAKA_ACC_GPU_HIP_ENABLED
 
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+  template <typename TDim>
+  struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : public std::false_type {};
+#endif  // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+
   // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;
@@ -75,13 +80,13 @@ namespace cms::alpakatools {
   public:
     ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
-          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
           extent_{stride_} {}
 
     ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
-          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
           extent_{extent} {}
 
@@ -94,7 +99,7 @@ namespace cms::alpakatools {
             extent_{extent},
             first_{std::min(first, extent)},
             index_{first_},
-            last_{std::min(first + elements, extent)} {}
+            range_{std::min(first + elements, extent)} {}
 
     public:
       ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
@@ -104,21 +109,21 @@ namespace cms::alpakatools {
         if constexpr (requires_single_thread_per_block_v<TAcc>) {
           // increment the index along the elements processed by the current thread
           ++index_;
-          if (index_ < last_)
+          if (index_ < range_)
             return *this;
         }
 
         // increment the thread index with the grid stride
         first_ += stride_;
         index_ = first_;
-        last_ = std::min(first_ + elements_, extent_);
+        range_ = std::min(first_ + elements_, extent_);
         if (index_ < extent_)
           return *this;
 
         // the iterator has reached or passed the end of the extent, clamp it to the extent
         first_ = extent_;
         index_ = extent_;
-        last_ = extent_;
+        range_ = extent_;
         return *this;
       }
 
@@ -143,16 +148,16 @@ namespace cms::alpakatools {
       // modified by the pre/post-increment operator
       Idx first_;
       Idx index_;
-      Idx last_;
+      Idx range_;
     };
 
-    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); }
 
     ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
 
   private:
     const Idx elements_;
-    const Idx first_;
+    const Idx thread_;
     const Idx stride_;
     const Idx extent_;
   };
@@ -165,16 +170,19 @@ namespace cms::alpakatools {
 
     ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
-          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
           extent_{stride_} {}
 
     ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
-          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
           extent_{extent} {}
 
+    // tag used to construct an end iterator
+    struct at_end_t {};
+
     class iterator {
       friend class elements_with_stride_nd;
 
@@ -199,19 +207,23 @@ namespace cms::alpakatools {
       ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }
 
     private:
-      // private, explicit constructor
+      // construct an iterator pointing to the first element to be processed by the current thread
       ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
           : loop_{loop},
-            thread_{alpaka::elementwise_min(first, loop->extent_)},
+            first_{alpaka::elementwise_min(first, loop->extent_)},
             range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
-            index_{thread_} {}
+            index_{first_} {}
+
+      // construct an end iterator, pointing past the end of the extent
+      ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, at_end_t const&)
+          : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}
 
       template <size_t I>
       ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
         bool overflow = false;
         ++index_[I];
         if (index_[I] >= range_[I]) {
-          index_[I] = thread_[I];
+          index_[I] = first_[I];
           overflow = true;
         }
         return overflow;
@@ -234,13 +246,13 @@ namespace cms::alpakatools {
       template <size_t I>
       ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
         bool overflow = false;
-        thread_[I] += loop_->stride_[I];
-        if (thread_[I] >= loop_->extent_[I]) {
-          thread_[I] = loop_->first_[I];
+        first_[I] += loop_->stride_[I];
+        if (first_[I] >= loop_->extent_[I]) {
+          first_[I] = loop_->thread_[I];
           overflow = true;
         }
-        index_[I] = thread_[I];
-        range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]);
+        index_[I] = first_[I];
+        range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
         return overflow;
       }
 
@@ -277,7 +289,7 @@ namespace cms::alpakatools {
         }
 
         // the iterator has reached or passed the end of the extent, clamp it to the extent
-        thread_ = loop_->extent_;
+        first_ = loop_->extent_;
         range_ = loop_->extent_;
         index_ = loop_->extent_;
       }
@@ -286,18 +298,30 @@ namespace cms::alpakatools {
       const elements_with_stride_nd* loop_;
 
       // modified by the pre/post-increment operator
-      Vec thread_;  // first element processed by this thread
-      Vec range_;   // last element processed by this thread
-      Vec index_;   // current element processed by this thread
+      Vec first_;  // first element processed by this thread
+      Vec range_;  // one past the last element processed by this thread
+      Vec index_;  // current element processed by this thread
     };
 
-    ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; }
+    ALPAKA_FN_ACC inline iterator begin() const {
+      // check that all dimensions of the current thread index are within the extent
+      if ((thread_ < extent_).all()) {
+        // construct an iterator pointing to the first element to be processed by the current thread
+        return iterator{this, thread_};
+      } else {
+        // construct an end iterator, pointing past the end of the extent
+        return iterator{this, at_end_t{}};
+      }
+    }
 
-    ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; }
+    ALPAKA_FN_ACC inline iterator end() const {
+      // construct an end iterator, pointing past the end of the extent
+      return iterator{this, at_end_t{}};
+    }
 
   private:
     const Vec elements_;
-    const Vec first_;
+    const Vec thread_;
     const Vec stride_;
     const Vec extent_;
   };

diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -36,6 +36,17 @@ struct VectorAddKernel1D {
   }
 };
 
+struct VectorAddKernel2D {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const {
+    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
+      auto index = ndindex[0] * size[1] + ndindex[1];
+      out[index] = in1[index] + in2[index];
+    }
+  }
+};
+
 struct VectorAddKernel3D {
   template <typename TAcc, typename T>
   ALPAKA_FN_ACC void operator()(
@@ -136,6 +147,77 @@ TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag)
   }
 }
 
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel2D), s_tag) {
+  SECTION("VectorAddKernel2D") {
+    // get the list of devices on the current platform
+    auto const& devices = cms::alpakatools::devices<Platform>();
+    if (devices.empty()) {
+      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+                << ", the test will be skipped.\n";
+      return;
+    }
+
+    // random number generator with a gaussian distribution
+    std::random_device rd{};
+    std::default_random_engine rand{rd()};
+    std::normal_distribution<float> dist{0., 1.};
+
+    // tolerance
+    constexpr float epsilon = 0.000001;
+
+    // 2-dimensional and linearised buffer size
+    constexpr Vec2D ndsize = {16, 16};
+    constexpr size_t size = ndsize.prod();
+
+    // allocate input and output host buffers in pinned memory accessible by the Platform devices
+    auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+    auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+    auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+
+    // fill the input buffers with random data, and the output buffer with zeros
+    for (size_t i = 0; i < size; ++i) {
+      in1_h[i] = dist(rand);
+      in2_h[i] = dist(rand);
+      out_h[i] = 0.;
+    }
+
+    // run the test on each device
+    for (auto const& device : devices) {
+      std::cout << "Test 2D vector addition on " << alpaka::getName(device) << '\n';
+      auto queue = Queue(device);
+
+      // allocate input and output buffers on the device
+      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+
+      // copy the input data to the device; the size is known from the buffer objects
+      alpaka::memcpy(queue, in1_d, in1_h);
+      alpaka::memcpy(queue, in2_d, in2_h);
+
+      // fill the output buffer with zeros; the size is known from the buffer objects
+      alpaka::memset(queue, out_d, 0.);
+
+      // launch the 2-dimensional kernel
+      auto div = cms::alpakatools::make_workdiv<Acc2D>({4, 4}, {32, 32});
+      alpaka::exec<Acc2D>(queue, div, VectorAddKernel2D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);
+
+      // copy the results from the device to the host
+      alpaka::memcpy(queue, out_h, out_d);
+
+      // wait for all the operations to complete
+      alpaka::wait(queue);
+
+      // check the results
+      for (size_t i = 0; i < size; ++i) {
+        float sum = in1_h[i] + in2_h[i];
+        REQUIRE(out_h[i] < sum + epsilon);
+        REQUIRE(out_h[i] > sum - epsilon);
+      }
+    }
+  }
+}
+
 TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
   SECTION("VectorAddKernel3D") {
     // get the list of devices on the current platform