Merge pull request #39547 from fwyzard/alpaka_framework_part2
Add helper functions for alpaka kernels [12.5.x]
cmsbuild authored Oct 4, 2022
2 parents dba368b + 355d9d3, commit 980f9a3
Showing 5 changed files with 357 additions and 10 deletions.
42 changes: 42 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/vec.h
@@ -0,0 +1,42 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_vec_h
#define HeterogeneousCore_AlpakaInterface_interface_vec_h

#include <type_traits>

#include <alpaka/alpaka.hpp>

namespace alpaka {

  //! \return The element-wise minimum of one or more vectors.
  ALPAKA_NO_HOST_ACC_WARNING
  template <typename TDim,
            typename TVal,
            typename... Vecs,
            typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
  ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
    Vec<TDim, TVal> r;
    if constexpr (TDim::value > 0) {
      for (typename TDim::value_type i = 0; i < TDim::value; ++i)
        r[i] = std::min({p[i], qs[i]...});
    }
    return r;
  }

  //! \return The element-wise maximum of one or more vectors.
  ALPAKA_NO_HOST_ACC_WARNING
  template <typename TDim,
            typename TVal,
            typename... Vecs,
            typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
  ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
    Vec<TDim, TVal> r;
    if constexpr (TDim::value > 0) {
      for (typename TDim::value_type i = 0; i < TDim::value; ++i)
        r[i] = std::max({p[i], qs[i]...});
    }
    return r;
  }

}  // namespace alpaka

#endif // HeterogeneousCore_AlpakaInterface_interface_vec_h
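Both helpers are variadic: they accept one vector or several, reducing dimension by dimension. A minimal host-side usage sketch (not part of this commit, assuming nothing beyond alpaka itself and the header above):

#include <cassert>
#include <cstdint>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"

int main() {
  using Vec3D = alpaka::Vec<alpaka::DimInt<3u>, uint32_t>;
  Vec3D a{100, 10, 10};
  Vec3D b{42, 42, 1};
  Vec3D c{7, 20, 5};
  // the minimum and maximum are taken independently along each dimension
  assert((alpaka::elementwise_min(a, b, c) == Vec3D{7, 10, 1}));
  assert((alpaka::elementwise_max(a, b, c) == Vec3D{100, 42, 10}));
  return 0;
}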
266 changes: 266 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -0,0 +1,266 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h

#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"

namespace cms::alpakatools {

  using namespace alpaka_common;

  // If the first argument is not a multiple of the second argument, round it up to the next multiple
  inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }

  // Return the integer division of the first argument by the second argument, rounded up to the next integer
  inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
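  // For example, with hypothetical values, divide_up_by(1000, 64) == 16 is the number of
  // 64-element chunks needed to cover 1000 elements, and round_up_by(1000, 64) == 1024
  // is the corresponding padded size.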

  // Create an accelerator-dependent work division for 1-dimensional kernels
  template <typename TAcc,
            typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) {
      // On GPU backends, each thread is looking at a single element:
      // - the number of threads per block is "elements";
      // - the number of elements per thread is always 1.
      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
    } else
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) {
      // On GPU backends, each thread is looking at a single element:
      // - the number of threads per block is "elements";
      // - the number of elements per thread is always 1.
      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
    } else
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
    {
      // On CPU backends, run serially with a single thread per block:
      // - the number of threads per block is always 1;
      // - the number of elements per thread is "elements".
      return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
    }
  }
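  // For example, with hypothetical numbers, make_workdiv<Acc1D>(16, 64) describes
  // 16 blocks of 64 threads with 1 element per thread on a CUDA or HIP backend, and
  // 16 blocks of a single thread with 64 elements per thread on a CPU backend.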

  // Create the accelerator-dependent workdiv for N-dimensional kernels
  template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                                 const Vec<alpaka::Dim<TAcc>>& elements) {
    using Dim = alpaka::Dim<TAcc>;
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) {
      // On GPU backends, each thread is looking at a single element:
      // - the number of threads per block is "elements";
      // - the number of elements per thread is always 1.
      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
    } else
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) {
      // On GPU backends, each thread is looking at a single element:
      // - the number of threads per block is "elements";
      // - the number of elements per thread is always 1.
      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
    } else
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
    {
      // On CPU backends, run serially with a single thread per block:
      // - the number of threads per block is always 1;
      // - the number of elements per thread is "elements".
      return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
    }
  }
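  // For example, with hypothetical numbers and assuming an Acc2D alias,
  // make_workdiv<Acc2D>({8, 8}, {4, 32}) describes 8x8 blocks of 4x32 threads with
  // one element per thread on a GPU backend, and 8x8 blocks of a single thread with
  // 4x32 elements per thread on a CPU backend.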

  template <typename TAcc,
            typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  class elements_with_stride {
  public:
    ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
          extent_{extent} {}

    class iterator {
      friend class elements_with_stride;

      ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
          : elements_{elements},
            stride_{stride},
            extent_{extent},
            first_{std::min(first, extent)},
            index_{first_},
            last_{std::min(first + elements, extent)} {}

    public:
      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline iterator& operator++() {
        // increment the index along the elements processed by the current thread
        ++index_;
        if (index_ < last_)
          return *this;

        // increment the thread index with the grid stride
        // stride_ already accounts for the number of elements per thread
        first_ += stride_;
        index_ = first_;
        last_ = std::min(first_ + elements_, extent_);
        if (index_ < extent_)
          return *this;

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        index_ = extent_;
        last_ = extent_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline iterator operator++(int) {
        iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
        return (index_ == other.index_) and (first_ == other.first_);
      }

      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Idx elements_;
      Idx stride_;
      Idx extent_;
      // modified by the pre/post-increment operator
      Idx first_;
      Idx index_;
      Idx last_;
    };

    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }

    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }

  private:
    const Idx elements_;
    const Idx first_;
    const Idx stride_;
    const Idx extent_;
  };
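  // A hypothetical usage sketch inside a 1-dimensional kernel body:
  //
  //   for (Idx i : elements_with_stride(acc, size)) {
  //     out[i] = f(in[i]);
  //   }
  //
  // visits every index in [0, size) exactly once, on any backend.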

  template <typename TAcc,
            typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  class elements_with_stride_nd {
  public:
    using Dim = alpaka::Dim<TAcc>;
    using Vec = alpaka::Vec<Dim, Idx>;

    ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          extent_{extent} {}

    class iterator {
      friend class elements_with_stride_nd;
      constexpr static const auto last_dimension = Dim::value - 1;

      ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first)
          : elements_{elements},
            stride_{stride},
            extent_{extent},
            first_{alpaka::elementwise_min(first, extent)},
            index_{first_},
            last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {}

    public:
      ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline iterator& operator++() {
        // increment the index along the elements processed by the current thread
        ++index_[last_dimension];
        if (index_[last_dimension] < last_)
          return *this;

        // increment the thread index along the last dimension with the grid stride
        first_[last_dimension] += stride_[last_dimension];
        index_[last_dimension] = first_[last_dimension];
        last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
        if (index_[last_dimension] < extent_[last_dimension])
          return *this;

        // increment the thread index along the outer dimensions with the grid stride,
        // counting down to avoid underflowing the unsigned dimension index
        if constexpr (last_dimension > 0)
          for (auto dimension = last_dimension; dimension > 0; --dimension) {
            first_[dimension - 1] += stride_[dimension - 1];
            index_[dimension - 1] = first_[dimension - 1];
            if (index_[dimension - 1] < extent_[dimension - 1])
              return *this;
          }

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        index_ = extent_;
        last_ = extent_[last_dimension];
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline iterator operator++(int) {
        iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
        return (index_ == other.index_) and (first_ == other.first_);
      }

      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Vec elements_;
      Vec stride_;
      Vec extent_;
      // modified by the pre/post-increment operator
      Vec first_;
      Vec index_;
      Idx last_;
    };

    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }

    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }

  private:
    const Vec elements_;
    const Vec first_;
    const Vec stride_;
    const Vec extent_;
  };
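  // A hypothetical usage sketch inside a 2-dimensional kernel body, assuming a
  // Vec2D alias like the ones in config.h:
  //
  //   for (auto index : elements_with_stride_nd(acc, Vec2D{ny, nx})) {
  //     out[index[0u]][index[1u]] = f(in[index[0u]][index[1u]]);
  //   }
  //
  // visits every 2-dimensional index within the ny x nx extent.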

} // namespace cms::alpakatools

#endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
6 changes: 6 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -0,0 +1,6 @@
<bin name="alpakaTestVec" file="alpaka/testVec.cc">
  <use name="alpaka"/>
  <use name="catch2"/>
  <use name="HeterogeneousCore/AlpakaInterface"/>
  <flags ALPAKA_BACKENDS="1"/>
</bin>
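The ALPAKA_BACKENDS flag follows the usual CMSSW convention of building the test once per enabled Alpaka backend, so the new checks should run on every available accelerator (e.g. as part of scram b runtests).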
25 changes: 25 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc
@@ -0,0 +1,25 @@
#include <alpaka/alpaka.hpp>

#define CATCH_CONFIG_MAIN
#include <catch.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"

static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec) "]";

TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec), s_tag) {
  SECTION("Vec1D") {
    alpaka_common::Vec1D s1 = 100;
    alpaka_common::Vec1D s2 = 42;
    alpaka_common::Vec1D si = 42;
    REQUIRE(alpaka::elementwise_min(s1, s2) == si);
  }

  SECTION("Vec3D") {
    alpaka_common::Vec3D v1{100, 10, 10};
    alpaka_common::Vec3D v2{42, 42, 1};
    alpaka_common::Vec3D vi{42, 10, 1};
    REQUIRE(alpaka::elementwise_min(v1, v2) == vi);
  }
}
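The test exercises only elementwise_min; a matching check for elementwise_max (a sketch, not part of this commit) would follow the same pattern inside the TEST_CASE:

SECTION("Vec3D max") {
  alpaka_common::Vec3D v1{100, 10, 10};
  alpaka_common::Vec3D v2{42, 42, 1};
  alpaka_common::Vec3D va{100, 42, 10};
  REQUIRE(alpaka::elementwise_max(v1, v2) == va);
}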
28 changes: 18 additions & 10 deletions HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
@@ -8,37 +8,45 @@
#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

#include "TestAlgo.h"

namespace ALPAKA_ACCELERATOR_NAMESPACE {

using namespace cms::alpakatools;

class TestAlgoKernel {
public:
template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>>
ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, int32_t size) const {
// this example accepts an arbitrary number of blocks and threads, and always uses 1 element per thread
// global index of the thread within the grid
const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
const int32_t stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};

// set this only once in the whole kernel grid
if (thread == 0) {
view.r() = 1.;
}
for (auto i = thread; i < size; i += stride) {

// make a strided loop over the kernel grid, covering up to "size" elements
for (int32_t i : elements_with_stride(acc, size)) {
view[i] = {0., 0., 0., i, matrix * i};
}
}
};

void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection) const {
auto const& deviceProperties = alpaka::getAccDevProps<Acc1D>(alpaka::getDev(queue));
uint32_t maxThreadsPerBlock = deviceProperties.m_blockThreadExtentMax[0];
// use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
uint32_t items = 64;

// use as many groups as needed to cover the whole problem
uint32_t groups = divide_up_by(collection->metadata().size(), items);

uint32_t threadsPerBlock = maxThreadsPerBlock;
uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
uint32_t elementsPerThread = 1;
auto workDiv = WorkDiv1D{blocksPerGrid, threadsPerBlock, elementsPerThread};
// map items to
// - threads with a single element per thread on a GPU backend
// - elements within a single thread on a CPU backend
auto workDiv = make_workdiv<Acc1D>(groups, items);

alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), collection->metadata().size());
}
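With a hypothetical collection of 1000 elements, divide_up_by(1000, 64) gives 16 groups, so the kernel runs as 16 blocks of 64 threads with one element each on a GPU backend, or as 16 blocks of a single thread processing 64 elements on a CPU backend; in either case elements_with_stride(acc, size) clamps the last, partially filled group to the 1000-element extent.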
