Skip to content

Commit

Permalink
Merge pull request #42559 from cms-patatrack/Alpaka_updates_13.0.x
Browse files Browse the repository at this point in the history
Fix elements_with_stride_nd when the index is outside the extent [13.2.x]
  • Loading branch information
cmsbuild authored Aug 15, 2023
2 parents 229771f + 938674c commit 53a2fda
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 27 deletions.
78 changes: 51 additions & 27 deletions HeterogeneousCore/AlpakaInterface/interface/workdivision.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ namespace cms::alpakatools {
struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
// Specialisation for the alpaka::AccCpuThreads accelerator: it derives from std::false_type,
// i.e. this accelerator is marked as not requiring a single thread per block.
template <typename TDim>
struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : public std::false_type {};
#endif // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

// Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped.
// Convenience variable template exposing requires_single_thread_per_block<TAcc>::value; the defaulted
// second template parameter restricts it to types for which alpaka::isAccelerator<TAcc> is true.
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;
Expand Down Expand Up @@ -75,13 +80,13 @@ namespace cms::alpakatools {
public:
ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
extent_{stride_} {}

ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
extent_{extent} {}

Expand All @@ -94,7 +99,7 @@ namespace cms::alpakatools {
extent_{extent},
first_{std::min(first, extent)},
index_{first_},
last_{std::min(first + elements, extent)} {}
range_{std::min(first + elements, extent)} {}

public:
ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
Expand All @@ -104,21 +109,21 @@ namespace cms::alpakatools {
if constexpr (requires_single_thread_per_block_v<TAcc>) {
// increment the index along the elements processed by the current thread
++index_;
if (index_ < last_)
if (index_ < range_)
return *this;
}

// increment the thread index with the grid stride
first_ += stride_;
index_ = first_;
last_ = std::min(first_ + elements_, extent_);
range_ = std::min(first_ + elements_, extent_);
if (index_ < extent_)
return *this;

// the iterator has reached or passed the end of the extent, clamp it to the extent
first_ = extent_;
index_ = extent_;
last_ = extent_;
range_ = extent_;
return *this;
}

Expand All @@ -143,16 +148,16 @@ namespace cms::alpakatools {
// modified by the pre/post-increment operator
Idx first_;
Idx index_;
Idx last_;
Idx range_;
};

ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); }

ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }

private:
const Idx elements_;
const Idx first_;
const Idx thread_;
const Idx stride_;
const Idx extent_;
};
Expand All @@ -165,16 +170,19 @@ namespace cms::alpakatools {

ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
extent_{stride_} {}

ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
extent_{extent} {}

// tag used to construct an end iterator
struct at_end_t {};

class iterator {
friend class elements_with_stride_nd;

Expand All @@ -199,19 +207,23 @@ namespace cms::alpakatools {
ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }

private:
// private, explicit constructor
// construct an iterator pointing to the first element to be processed by the current thread
ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
: loop_{loop},
thread_{alpaka::elementwise_min(first, loop->extent_)},
first_{alpaka::elementwise_min(first, loop->extent_)},
range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
index_{thread_} {}
index_{first_} {}

// construct an end iterator, pointing past the end of the extent:
// first_, range_ and index_ are all set to the extent of the loop.
// Note: the initialisers read loop_, which is declared (and therefore initialised) before them.
ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, at_end_t const&)
: loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}

template <size_t I>
ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
bool overflow = false;
++index_[I];
if (index_[I] >= range_[I]) {
index_[I] = thread_[I];
index_[I] = first_[I];
overflow = true;
}
return overflow;
Expand All @@ -234,13 +246,13 @@ namespace cms::alpakatools {
template <size_t I>
ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
bool overflow = false;
thread_[I] += loop_->stride_[I];
if (thread_[I] >= loop_->extent_[I]) {
thread_[I] = loop_->first_[I];
first_[I] += loop_->stride_[I];
if (first_[I] >= loop_->extent_[I]) {
first_[I] = loop_->thread_[I];
overflow = true;
}
index_[I] = thread_[I];
range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]);
index_[I] = first_[I];
range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
return overflow;
}

Expand Down Expand Up @@ -277,7 +289,7 @@ namespace cms::alpakatools {
}

// the iterator has reached or passed the end of the extent, clamp it to the extent
thread_ = loop_->extent_;
first_ = loop_->extent_;
range_ = loop_->extent_;
index_ = loop_->extent_;
}
Expand All @@ -286,18 +298,30 @@ namespace cms::alpakatools {
const elements_with_stride_nd* loop_;

// modified by the pre/post-increment operator
Vec thread_; // first element processed by this thread
Vec range_; // last element processed by this thread
Vec index_; // current element processed by this thread
Vec first_; // first element processed by this thread
Vec range_; // end of the range processed by this thread (one past the last element, clamped to the extent)
Vec index_; // current element processed by this thread
};

ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; }
ALPAKA_FN_ACC inline iterator begin() const {
  // if any dimension of the current thread index is not within the extent,
  // there is nothing to process: return an end iterator, pointing past the end of the extent
  if (not(thread_ < extent_).all()) {
    return iterator{this, at_end_t{}};
  }

  // otherwise, return an iterator pointing to the first element to be processed by the current thread
  return iterator{this, thread_};
}

ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; }
ALPAKA_FN_ACC inline iterator end() const {
// construct an end iterator, pointing past the end of the extent
return iterator{this, at_end_t{}};
}

private:
const Vec elements_;
const Vec first_;
const Vec thread_;
const Vec stride_;
const Vec extent_;
};
Expand Down
82 changes: 82 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@ struct VectorAddKernel1D {
}
};

// Kernel adding two linearised 2-dimensional arrays element by element: out = in1 + in2
struct VectorAddKernel2D {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const {
    // visit every 2-dimensional index within `size` that is assigned to the current thread
    for (auto coord : cms::alpakatools::elements_with_stride_nd(acc, size)) {
      // row-major linearisation: size[1] is the length of the innermost dimension
      auto const row_offset = coord[0] * size[1];
      auto const linear = row_offset + coord[1];
      out[linear] = in1[linear] + in2[linear];
    }
  }
};

struct VectorAddKernel3D {
template <typename TAcc, typename T>
ALPAKA_FN_ACC void operator()(
Expand Down Expand Up @@ -136,6 +147,77 @@ TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag)
}
}

// Check VectorAddKernel2D on every device of the current platform: fill two host buffers with random
// values, add them on the device, copy the result back and compare it with the sum computed on the host.
TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel2D), s_tag) {
SECTION("VectorAddKernel2D") {
// get the list of devices on the current platform
auto const& devices = cms::alpakatools::devices<Platform>();
if (devices.empty()) {
std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
<< ", the test will be skipped.\n";
return;
}

// random number generator with a gaussian distribution
std::random_device rd{};
std::default_random_engine rand{rd()};
std::normal_distribution<float> dist{0., 1.};

// tolerance
constexpr float epsilon = 0.000001;

// 2-dimensional and linearised buffer size
constexpr Vec2D ndsize = {16, 16};
constexpr size_t size = ndsize.prod();

// allocate input and output host buffers in pinned memory accessible by the Platform devices
auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);

// fill the input buffers with random data, and the output buffer with zeros
for (size_t i = 0; i < size; ++i) {
in1_h[i] = dist(rand);
in2_h[i] = dist(rand);
out_h[i] = 0.;
}

// run the test on each device
for (auto const& device : devices) {
std::cout << "Test 2D vector addition on " << alpaka::getName(device) << '\n';
auto queue = Queue(device);

// allocate input and output buffers on the device
auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);

// copy the input data to the device; the size is known from the buffer objects
alpaka::memcpy(queue, in1_d, in1_h);
alpaka::memcpy(queue, in2_d, in2_h);

// fill the output buffer with zeros; the size is known from the buffer objects
alpaka::memset(queue, out_d, 0.);

// launch the 2-dimensional kernel
// NOTE(review): the work division ({4, 4}, {32, 32}) presumably spans more indices than the {16, 16}
// extent, so that some threads start outside the extent - confirm against make_workdiv
auto div = cms::alpakatools::make_workdiv<Acc2D>({4, 4}, {32, 32});
alpaka::exec<Acc2D>(queue, div, VectorAddKernel2D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);

// copy the results from the device to the host
alpaka::memcpy(queue, out_h, out_d);

// wait for all the operations to complete
alpaka::wait(queue);

// check the results: each output value must match the host-side sum within the tolerance
for (size_t i = 0; i < size; ++i) {
float sum = in1_h[i] + in2_h[i];
REQUIRE(out_h[i] < sum + epsilon);
REQUIRE(out_h[i] > sum - epsilon);
}
}
}
}

TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
SECTION("VectorAddKernel3D") {
// get the list of devices on the current platform
Expand Down

0 comments on commit 53a2fda

Please sign in to comment.