diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 0d295855976da..7449bb153c9f7 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -32,6 +32,11 @@ namespace cms::alpakatools { struct requires_single_thread_per_block> : public std::false_type {}; #endif // ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + template + struct requires_single_thread_per_block> : public std::false_type {}; +#endif // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped template >> inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block::value; @@ -75,13 +80,13 @@ namespace cms::alpakatools { public: ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc) : elements_{alpaka::getWorkDiv(acc)[0u]}, - first_{alpaka::getIdx(acc)[0u] * elements_}, + thread_{alpaka::getIdx(acc)[0u] * elements_}, stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{stride_} {} ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent) : elements_{alpaka::getWorkDiv(acc)[0u]}, - first_{alpaka::getIdx(acc)[0u] * elements_}, + thread_{alpaka::getIdx(acc)[0u] * elements_}, stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{extent} {} @@ -94,7 +99,7 @@ namespace cms::alpakatools { extent_{extent}, first_{std::min(first, extent)}, index_{first_}, - last_{std::min(first + elements, extent)} {} + range_{std::min(first + elements, extent)} {} public: ALPAKA_FN_ACC inline Idx operator*() const { return index_; } @@ -104,21 +109,21 @@ namespace cms::alpakatools { if constexpr (requires_single_thread_per_block_v) { // increment the index along the elements processed by the current thread ++index_; - if (index_ < last_) + if (index_ < range_) return *this; } // increment the 
thread index with the grid stride first_ += stride_; index_ = first_; - last_ = std::min(first_ + elements_, extent_); + range_ = std::min(first_ + elements_, extent_); if (index_ < extent_) return *this; // the iterator has reached or passed the end of the extent, clamp it to the extent first_ = extent_; index_ = extent_; - last_ = extent_; + range_ = extent_; return *this; } @@ -143,16 +148,16 @@ namespace cms::alpakatools { // modified by the pre/post-increment operator Idx first_; Idx index_; - Idx last_; + Idx range_; }; - ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } + ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); } ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } private: const Idx elements_; - const Idx first_; + const Idx thread_; const Idx stride_; const Idx extent_; }; @@ -165,16 +170,19 @@ namespace cms::alpakatools { ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc) : elements_{alpaka::getWorkDiv(acc)}, - first_{alpaka::getIdx(acc) * elements_}, + thread_{alpaka::getIdx(acc) * elements_}, stride_{alpaka::getWorkDiv(acc) * elements_}, extent_{stride_} {} ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent) : elements_{alpaka::getWorkDiv(acc)}, - first_{alpaka::getIdx(acc) * elements_}, + thread_{alpaka::getIdx(acc) * elements_}, stride_{alpaka::getWorkDiv(acc) * elements_}, extent_{extent} {} + // tag used to construct an end iterator + struct at_end_t {}; + class iterator { friend class elements_with_stride_nd; @@ -199,19 +207,23 @@ namespace cms::alpakatools { ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); } private: - // private, explicit constructor + // construct an iterator pointing to the first element to be processed by the current thread ALPAKA_FN_ACC inline iterator(elements_with_stride_nd 
const* loop, Vec first) : loop_{loop}, - thread_{alpaka::elementwise_min(first, loop->extent_)}, + first_{alpaka::elementwise_min(first, loop->extent_)}, range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}, - index_{thread_} {} + index_{first_} {} + + // construct an end iterator, pointing past the end of the extent + ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, at_end_t const&) + : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {} template ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() { bool overflow = false; ++index_[I]; if (index_[I] >= range_[I]) { - index_[I] = thread_[I]; + index_[I] = first_[I]; overflow = true; } return overflow; @@ -234,13 +246,13 @@ namespace cms::alpakatools { template ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() { bool overflow = false; - thread_[I] += loop_->stride_[I]; - if (thread_[I] >= loop_->extent_[I]) { - thread_[I] = loop_->first_[I]; + first_[I] += loop_->stride_[I]; + if (first_[I] >= loop_->extent_[I]) { + first_[I] = loop_->thread_[I]; overflow = true; } - index_[I] = thread_[I]; - range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]); + index_[I] = first_[I]; + range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]); return overflow; } @@ -277,7 +289,7 @@ namespace cms::alpakatools { } // the iterator has reached or passed the end of the extent, clamp it to the extent - thread_ = loop_->extent_; + first_ = loop_->extent_; range_ = loop_->extent_; index_ = loop_->extent_; } @@ -286,18 +298,30 @@ namespace cms::alpakatools { const elements_with_stride_nd* loop_; // modified by the pre/post-increment operator - Vec thread_; // first element processed by this thread - Vec range_; // last element processed by this thread - Vec index_; // current element processed by this thread + Vec first_; // first element processed by this thread + Vec range_; // one past the last element processed by this thread + Vec 
index_; // current element processed by this thread }; - ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; } + ALPAKA_FN_ACC inline iterator begin() const { + // check that all dimensions of the current thread index are within the extent + if ((thread_ < extent_).all()) { + // construct an iterator pointing to the first element to be processed by the current thread + return iterator{this, thread_}; + } else { + // construct an end iterator, pointing past the end of the extent + return iterator{this, at_end_t{}}; + } + } - ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; } + ALPAKA_FN_ACC inline iterator end() const { + // construct an end iterator, pointing past the end of the extent + return iterator{this, at_end_t{}}; + } private: const Vec elements_; - const Vec first_; + const Vec thread_; const Vec stride_; const Vec extent_; }; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc index 11803a3443737..c35965fa8793b 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc @@ -36,6 +36,17 @@ struct VectorAddKernel1D { } }; +struct VectorAddKernel2D { + template + ALPAKA_FN_ACC void operator()( + TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const { + for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) { + auto index = ndindex[0] * size[1] + ndindex[1]; + out[index] = in1[index] + in2[index]; + } + } +}; + struct VectorAddKernel3D { template ALPAKA_FN_ACC void operator()( @@ -136,6 +147,77 @@ TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) } } +TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel2D), s_tag) { + SECTION("VectorAddKernel2D") { + // get the list of devices on the current platform + auto 
const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return; + } + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // 2-dimensional and linearised buffer size + constexpr Vec2D ndsize = {16, 16}; + constexpr size_t size = ndsize.prod(); + + // allocate input and output host buffers in pinned memory accessible by the Platform devices + auto in1_h = cms::alpakatools::make_host_buffer(size); + auto in2_h = cms::alpakatools::make_host_buffer(size); + auto out_h = cms::alpakatools::make_host_buffer(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + // run the test on each device + for (auto const& device : devices) { + std::cout << "Test 2D vector addition on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + + // allocate input and output buffers on the device + auto in1_d = cms::alpakatools::make_device_buffer(queue, size); + auto in2_d = cms::alpakatools::make_device_buffer(queue, size); + auto out_d = cms::alpakatools::make_device_buffer(queue, size); + + // copy the input data to the device; the size is known from the buffer objects + alpaka::memcpy(queue, in1_d, in1_h); + alpaka::memcpy(queue, in2_d, in2_h); + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::memset(queue, out_d, 0.); + + // launch the 2-dimensional kernel + auto div = cms::alpakatools::make_workdiv({4, 4}, {32, 32}); + alpaka::exec(queue, div, VectorAddKernel2D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize); + + // copy the results from the device to 
the host + alpaka::memcpy(queue, out_h, out_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + // check the results + for (size_t i = 0; i < size; ++i) { + float sum = in1_h[i] + in2_h[i]; + REQUIRE(out_h[i] < sum + epsilon); + REQUIRE(out_h[i] > sum - epsilon); + } + } + } +} + TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) { SECTION("VectorAddKernel3D") { // get the list of devices on the current platform