Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix elements_with_stride_nd when the index is outside the extent [13.2.x] #42559

Merged
merged 2 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 51 additions & 27 deletions HeterogeneousCore/AlpakaInterface/interface/workdivision.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ namespace cms::alpakatools {
struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
// Specialisation for the alpaka::AccCpuThreads accelerator: it derives from std::false_type,
// i.e. this accelerator does not require a single thread per block.
template <typename TDim>
struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : public std::false_type {};
#endif // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

// Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped.
// Shorthand for requires_single_thread_per_block<TAcc>::value; the defaulted second template
// parameter restricts its use to types for which alpaka::isAccelerator<TAcc> is true.
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;
Expand Down Expand Up @@ -75,13 +80,13 @@ namespace cms::alpakatools {
public:
ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
extent_{stride_} {}

ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
extent_{extent} {}

Expand All @@ -94,7 +99,7 @@ namespace cms::alpakatools {
extent_{extent},
first_{std::min(first, extent)},
index_{first_},
last_{std::min(first + elements, extent)} {}
range_{std::min(first + elements, extent)} {}

public:
ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
Expand All @@ -104,21 +109,21 @@ namespace cms::alpakatools {
if constexpr (requires_single_thread_per_block_v<TAcc>) {
// increment the index along the elements processed by the current thread
++index_;
if (index_ < last_)
if (index_ < range_)
return *this;
}

// increment the thread index with the grid stride
first_ += stride_;
index_ = first_;
last_ = std::min(first_ + elements_, extent_);
range_ = std::min(first_ + elements_, extent_);
if (index_ < extent_)
return *this;

// the iterator has reached or passed the end of the extent, clamp it to the extent
first_ = extent_;
index_ = extent_;
last_ = extent_;
range_ = extent_;
return *this;
}

Expand All @@ -143,16 +148,16 @@ namespace cms::alpakatools {
// modified by the pre/post-increment operator
Idx first_;
Idx index_;
Idx last_;
Idx range_;
};

ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); }

ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }

private:
const Idx elements_;
const Idx first_;
const Idx thread_;
const Idx stride_;
const Idx extent_;
};
Expand All @@ -165,16 +170,19 @@ namespace cms::alpakatools {

ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
extent_{stride_} {}

ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
extent_{extent} {}

// empty tag type used to select the iterator constructor that builds an end iterator
struct at_end_t {};

class iterator {
friend class elements_with_stride_nd;

Expand All @@ -199,19 +207,23 @@ namespace cms::alpakatools {
ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }

private:
// private, explicit constructor
// construct an iterator pointing to the first element to be processed by the current thread
ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
: loop_{loop},
thread_{alpaka::elementwise_min(first, loop->extent_)},
first_{alpaka::elementwise_min(first, loop->extent_)},
range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
index_{thread_} {}
index_{first_} {}

// construct an end iterator, pointing past the end of the extent:
// first_, range_ and index_ are all set to the extent of the enclosing loop
ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, at_end_t const&)
: loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}

template <size_t I>
ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
bool overflow = false;
++index_[I];
if (index_[I] >= range_[I]) {
index_[I] = thread_[I];
index_[I] = first_[I];
overflow = true;
}
return overflow;
Expand All @@ -234,13 +246,13 @@ namespace cms::alpakatools {
template <size_t I>
ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
bool overflow = false;
thread_[I] += loop_->stride_[I];
if (thread_[I] >= loop_->extent_[I]) {
thread_[I] = loop_->first_[I];
first_[I] += loop_->stride_[I];
if (first_[I] >= loop_->extent_[I]) {
first_[I] = loop_->thread_[I];
overflow = true;
}
index_[I] = thread_[I];
range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]);
index_[I] = first_[I];
range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
return overflow;
}

Expand Down Expand Up @@ -277,7 +289,7 @@ namespace cms::alpakatools {
}

// the iterator has reached or passed the end of the extent, clamp it to the extent
thread_ = loop_->extent_;
first_ = loop_->extent_;
range_ = loop_->extent_;
index_ = loop_->extent_;
}
Expand All @@ -286,18 +298,30 @@ namespace cms::alpakatools {
const elements_with_stride_nd* loop_;

// modified by the pre/post-increment operator
Vec thread_; // first element processed by this thread
Vec range_; // last element processed by this thread
Vec index_; // current element processed by this thread
Vec first_; // first element processed by this thread
Vec range_; // last element processed by this thread
Vec index_; // current element processed by this thread
};

ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; }
ALPAKA_FN_ACC inline iterator begin() const {
  // if the current thread index is not within the extent in every dimension,
  // return an end iterator, pointing past the end of the extent
  if (not(thread_ < extent_).all()) {
    return iterator{this, at_end_t{}};
  }

  // otherwise, return an iterator pointing to the first element to be processed by the current thread
  return iterator{this, thread_};
}

ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; }
ALPAKA_FN_ACC inline iterator end() const {
// construct an end iterator, pointing past the end of the extent
return iterator{this, at_end_t{}};
}

private:
const Vec elements_;
const Vec first_;
const Vec thread_;
const Vec stride_;
const Vec extent_;
};
Expand Down
82 changes: 82 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@ struct VectorAddKernel1D {
}
};

// Kernel computing the element-wise sum of two buffers addressed through 2-dimensional indices.
// Each 2D index produced by elements_with_stride_nd(acc, size) is linearised as
// ndindex[0] * size[1] + ndindex[1], and that offset is used for the two inputs and the output.
struct VectorAddKernel2D {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec2D size) const {
    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
      // linearise the 2D index into an offset in the flat buffers
      auto offset = ndindex[0] * size[1];
      offset += ndindex[1];

      // add the two inputs at this offset
      T const lhs = in1[offset];
      T const rhs = in2[offset];
      out[offset] = lhs + rhs;
    }
  }
};

struct VectorAddKernel3D {
template <typename TAcc, typename T>
ALPAKA_FN_ACC void operator()(
Expand Down Expand Up @@ -136,6 +147,77 @@ TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag)
}
}

// Test of VectorAddKernel2D: for every device of the platform, add two random host-generated
// buffers on the device through a 2-dimensional kernel and compare the result with the sum
// computed on the host, within a tolerance of epsilon.
TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel2D), s_tag) {
SECTION("VectorAddKernel2D") {
// get the list of devices on the current platform
auto const& devices = cms::alpakatools::devices<Platform>();
if (devices.empty()) {
std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
<< ", the test will be skipped.\n";
return;
}

// random number generator with a gaussian distribution
// (seeded from std::random_device, so the input data differ from run to run)
std::random_device rd{};
std::default_random_engine rand{rd()};
std::normal_distribution<float> dist{0., 1.};

// tolerance used when comparing the device results with the host sums
constexpr float epsilon = 0.000001;

// 2-dimensional and linearised buffer size
constexpr Vec2D ndsize = {16, 16};
constexpr size_t size = ndsize.prod();

// allocate input and output host buffers in pinned memory accessible by the Platform devices
auto in1_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
auto in2_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);

// fill the input buffers with random data, and the output buffer with zeros
for (size_t i = 0; i < size; ++i) {
in1_h[i] = dist(rand);
in2_h[i] = dist(rand);
out_h[i] = 0.;
}

// run the test on each device
for (auto const& device : devices) {
std::cout << "Test 2D vector addition on " << alpaka::getName(device) << '\n';
auto queue = Queue(device);

// allocate input and output buffers on the device
auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);

// copy the input data to the device; the size is known from the buffer objects
alpaka::memcpy(queue, in1_d, in1_h);
alpaka::memcpy(queue, in2_d, in2_h);

// fill the output buffer with zeros; the size is known from the buffer objects
alpaka::memset(queue, out_d, 0.);

// launch the 2-dimensional kernel
// NOTE(review): presumably make_workdiv takes {blocks} and {threads or elements per block}, which
// would make the launch grid larger than the 16x16 extent, so that some threads start outside
// the extent - confirm against make_workdiv
auto div = cms::alpakatools::make_workdiv<Acc2D>({4, 4}, {32, 32});
alpaka::exec<Acc2D>(queue, div, VectorAddKernel2D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);

// copy the results from the device to the host
alpaka::memcpy(queue, out_h, out_d);

// wait for all the operations to complete
alpaka::wait(queue);

// check the results: each output must be within epsilon of the sum computed on the host
for (size_t i = 0; i < size; ++i) {
float sum = in1_h[i] + in2_h[i];
REQUIRE(out_h[i] < sum + epsilon);
REQUIRE(out_h[i] > sum - epsilon);
}
}
}
}

TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
SECTION("VectorAddKernel3D") {
// get the list of devices on the current platform
Expand Down