Plumb ExecutionSpace template parameter through more classes
johnbowen42 committed Sep 12, 2024
1 parent 364b46a commit fa2cc1b
Showing 13 changed files with 402 additions and 370 deletions.
33 changes: 25 additions & 8 deletions src/serac/infrastructure/accelerator.hpp
@@ -12,7 +12,6 @@
*/

#pragma once
#include "RAJA/RAJA.hpp"
#if defined(__CUDACC__)
#define SERAC_HOST_DEVICE __host__ __device__
#define SERAC_HOST __host__
@@ -49,7 +48,7 @@
*/
#define SERAC_SUPPRESS_NVCC_HOSTDEVICE_WARNING
#endif

#include "RAJA/RAJA.hpp"
#include <memory>

#include "axom/core.hpp"
@@ -72,24 +71,44 @@ enum class ExecutionSpace
Dynamic // Corresponds to execution that can "legally" happen on either the host or device
};

#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
template <ExecutionSpace exec>
struct EvaluationSpacePolicy;

template <>
struct EvaluationSpacePolicy<ExecutionSpace::CPU> {
/// @brief Loop policy for the thread-x dimension (sequential on the CPU).
using threads_x = RAJA::LoopPolicy<RAJA::seq_exec>;
/// @brief Loop policy for the team/block dimension (sequential on the CPU).
using teams_e = RAJA::LoopPolicy<RAJA::seq_exec>;
/// @brief Kernel launch policy (sequential launch).
using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
/// @brief Policy for RAJA::forall loops.
using forall_policy = RAJA::seq_exec;
};

#if defined(__CUDACC__)
template <>
struct EvaluationSpacePolicy<ExecutionSpace::GPU> {
/// @brief Loop policy for the thread-x dimension (one CUDA thread per iterate).
using threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
/// @brief Loop policy for the team/block dimension (one CUDA block per iterate).
using teams_e = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
/// @brief Kernel launch policy (synchronous CUDA launch).
using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;
/// @brief Policy for RAJA::forall loops (128-thread blocks).
using forall_policy = RAJA::cuda_exec<128>;
};
#endif
// TODO(cuda): Delete these serac namespace scope type definitions in favor
// of the above user-configurable execution policies.
#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
/// @brief Alias for parallel threads policy on GPU
using threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
using teams_e = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;
using forall_policy = RAJA::cuda_exec<128>;

#else

/// @brief Alias for parallel threads policy on GPU.
using threads_x = RAJA::LoopPolicy<RAJA::seq_exec>;
/// @brief Alias for number of teams for GPU kernel launches.
using teams_e = RAJA::LoopPolicy<RAJA::seq_exec>;
/// @brief Alias for GPU kernel launch policy.
using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using forall_policy = RAJA::seq_exec;

#endif
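As a sketch of how the new trait is meant to be consumed, a kernel can be templated on ExecutionSpace and pull its RAJA policies from EvaluationSpacePolicy at compile time. The axpy kernel below is hypothetical, not part of this commit; for the GPU instantiation the pointers must be device-accessible.

// Hypothetical caller: the trait selects seq_exec or cuda_exec<128> per space.
template <ExecutionSpace exec>
void axpy(double* y, const double* x, double a, int n)
{
  using policy = typename EvaluationSpacePolicy<exec>::forall_policy;
  RAJA::forall<policy>(RAJA::TypedRangeSegment<int>(0, n),
                       [=] SERAC_HOST_DEVICE(int i) { y[i] += a * x[i]; });
}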

/**
@@ -115,7 +134,6 @@ SERAC_HOST_DEVICE void suppress_capture_warnings(T...)
{
}

#ifdef SERAC_USE_UMPIRE
/// @overload
template <>
struct execution_to_memory<ExecutionSpace::CPU> {
@@ -133,7 +151,6 @@ template <>
struct execution_to_memory<ExecutionSpace::Dynamic> {
static constexpr axom::MemorySpace value = axom::MemorySpace::Unified;
};
#endif

/// @brief Helper template for @p execution_to_memory trait
template <ExecutionSpace space>
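With the SERAC_USE_UMPIRE guard gone, the execution_to_memory trait is always available and maps each execution space to the axom memory space its data should live in (CPU → Host, GPU → Device, Dynamic → Unified). A minimal sketch of the kind of helper this enables; make_array is hypothetical, and the detail:: qualification assumes the trait's namespace as the later Gather code suggests.

// Hypothetical helper: build a 2D array in the memory space matching `exec`.
template <ExecutionSpace exec, typename T>
axom::Array<T, 2, detail::execution_to_memory<exec>::value> make_array(int rows, int cols)
{
  return axom::Array<T, 2, detail::execution_to_memory<exec>::value>(rows, cols);
}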
10 changes: 5 additions & 5 deletions src/serac/numerics/functional/CMakeLists.txt
@@ -33,10 +33,10 @@ set(functional_headers
tuple_tensor_dual_functions.hpp
)

set(functional_sources
domain.cpp
element_restriction.cpp
geometric_factors.cpp
set(functional_sources
domain.cpp
element_restriction.cpp
geometric_factors.cpp
quadrature_data.cpp)

set(functional_detail_headers
@@ -66,7 +66,7 @@ blt_list_append(TO functional_headers ELEMENTS ${functional_cuda_headers} IF ENA

blt_add_library(
NAME serac_functional
HEADERS ${functional_headers} ${functional_detail_headers}
HEADERS ${functional_headers} ${functional_detail_headers}
SOURCES ${functional_sources}
DEPENDS_ON ${functional_depends}
)
10 changes: 2 additions & 8 deletions src/serac/numerics/functional/domain_integral_kernels.hpp
@@ -235,14 +235,6 @@ void evaluation_kernel_impl(trial_element_tuple_type trial_elements, test_elemen
// variadic non-type parameter.
[&ctx, t, J, x, u, trial_elements, qf, qpts_per_elem, rule, r, qf_state, elements, qf_derivatives,
qf_inputs, qf_outputs, interpolate_result, update_state](uint32_t e) {
// (void)qf_state;
// (void)qf_derivatives;
// (void)update_state;
// (void)qpts_per_elem;
// (void)trial_elements;
// (void)qf_inputs;
// (void)interpolate_result;
// (void)u;
detail::suppress_capture_warnings(qf_state, qf_derivatives, update_state, qpts_per_elem, trial_elements,
qf_inputs, interpolate_result, u);

@@ -478,6 +470,8 @@ void element_gradient_kernel(ExecArrayView<double, 3, ExecutionSpace::CPU> dK,

RAJA::RangeSegment x_range(0, nquad);
RAJA::loop<threads_x>(ctx, x_range, [&derivatives, qf_derivatives, nquad, e](int q) {
detail::suppress_capture_warnings(nquad);
(void)nquad;
if constexpr (is_QOI_2) {
get<0>(derivatives[e](q)) = qf_derivatives[e * nquad + uint32_t(q)];
}
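The replacement for the commented-out (void) casts above is detail::suppress_capture_warnings, the variadic no-op declared in accelerator.hpp: passing each conditionally-used capture to it counts as a use, silencing nvcc's unused-capture warnings in lambdas where an if constexpr branch drops out. A self-contained illustration of the idiom (the surrounding function and names are invented for this sketch):

// A no-op whose only purpose is to "use" its arguments.
template <typename... T>
SERAC_HOST_DEVICE void suppress_capture_warnings(T...) {}

template <bool apply_scale>
void scale_or_copy(double* out, const double* in, double scale, int n)
{
  auto body = [=](int i) {
    // `scale` is read only when apply_scale is true; naming it here keeps
    // the compiler from warning about an unused capture otherwise.
    suppress_capture_warnings(scale);
    if constexpr (apply_scale) {
      out[i] = scale * in[i];
    } else {
      out[i] = in[i];
    }
  };
  for (int i = 0; i < n; i++) body(i);
}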
76 changes: 11 additions & 65 deletions src/serac/numerics/functional/element_restriction.cpp
@@ -221,12 +221,8 @@ std::vector<Array2D<int>> geom_local_face_dofs(int p)
return output;
}

#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
axom::Array<DoF, 2, axom::MemorySpace::Device> GetElementRestriction(const mfem::FiniteElementSpace* fes,
#else
axom::Array<DoF, 2, axom::MemorySpace::Host> GetElementRestriction(const mfem::FiniteElementSpace* fes,
#endif
mfem::Geometry::Type geom)
mfem::Geometry::Type geom)
{
std::vector<DoF> elem_dofs{};
mfem::Mesh* mesh = fes->GetMesh();
@@ -434,6 +430,16 @@ uint64_t ElementRestriction::ESize() const { return esize; }

uint64_t ElementRestriction::LSize() const { return lsize; }

SERAC_HOST_DEVICE DoF ElementRestriction::GetVDofDevice(DoF node, mfem::Ordering::Type ordering_, uint64_t component,
uint64_t num_nodes_, uint64_t components_)
{
if (ordering_ == mfem::Ordering::Type::byNODES) {
return DoF{component * num_nodes_ + node.index(), (node.sign() == 1) ? 0ull : 1ull, node.orientation()};
} else {
return DoF{node.index() * components_ + component, (node.sign() == 1) ? 0ull : 1ull, node.orientation()};
}
}

DoF ElementRestriction::GetVDof(DoF node, uint64_t component) const
{
if (ordering == mfem::Ordering::Type::byNODES) {
@@ -461,52 +467,6 @@ void ElementRestriction::GetElementVDofs(int i, DoF* vdofs) const
}
}

void ElementRestriction::Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const
{
for (uint64_t i = 0; i < num_elements; i++) {
for (uint64_t c = 0; c < components; c++) {
for (uint64_t j = 0; j < nodes_per_elem; j++) {
uint64_t E_id = (i * components + c) * nodes_per_elem + j;
uint64_t L_id = GetVDof(dof_info(i, j), c).index();
auto l_ptr = L_vector.Read();
auto e_ptr = E_vector.ReadWrite();
// std::cout << "l vec space " << (int)(L_vector.GetMemory().GetMemoryType()) << std::endl;
// std::cout << "e vec space " << (int)(E_vector.GetMemory().GetMemoryType()) << std::endl;
// E_vector[int(E_id)] = L_vector[int(L_id)];
if (L_vector.GetMemory().GetMemoryType() != mfem::MemoryType::HOST) {
RAJA::forall<RAJA::cuda_exec<128>>(RAJA::RangeSegment(0, 1),
[=] SERAC_HOST_DEVICE(int idx) { e_ptr[int(E_id)] = l_ptr[int(L_id)]; });
} else {
E_vector[int(E_id)] = L_vector[int(L_id)];
}
}
}
}
}

void ElementRestriction::ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const
{
for (uint64_t i = 0; i < num_elements; i++) {
for (uint64_t c = 0; c < components; c++) {
for (uint64_t j = 0; j < nodes_per_elem; j++) {
uint64_t E_id = (i * components + c) * nodes_per_elem + j;
uint64_t L_id = GetVDof(dof_info(i, j), c).index();
// std::cout << E_vector[int(E_id)] << std::endl;
auto l_ptr = L_vector.ReadWrite();
auto e_ptr = E_vector.Read();
// std::cout << "l vec space " << (int)(L_vector.GetMemory().GetMemoryType()) << std::endl;
// std::cout << "e vec space " << (int)(E_vector.GetMemory().GetMemoryType()) << std::endl;
if (L_vector.GetMemory().GetMemoryType() != mfem::MemoryType::HOST) {
RAJA::forall<RAJA::cuda_exec<128>>(RAJA::RangeSegment(0, 1),
[=] SERAC_HOST_DEVICE(int idx) { l_ptr[int(L_id)] += e_ptr[int(E_id)]; });
} else {
L_vector[int(L_id)] += E_vector[int(E_id)];
}
}
}
}
}

////////////////////////////////////////////////////////////////////////

BlockElementRestriction::BlockElementRestriction(const mfem::FiniteElementSpace* fes)
@@ -562,18 +522,4 @@ mfem::Array<int> BlockElementRestriction::bOffsets() const
return offsets;
};

void BlockElementRestriction::Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const
{
for (auto [geom, restriction] : restrictions) {
restriction.Gather(L_vector, E_block_vector.GetBlock(geom));
}
}

void BlockElementRestriction::ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const
{
for (auto [geom, restriction] : restrictions) {
restriction.ScatterAdd(E_block_vector.GetBlock(geom), L_vector);
}
}

} // namespace serac
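A quick worked check of the two orderings GetVDofDevice handles, with namespace qualification omitted and values invented: for num_nodes_ = 4, components_ = 3, a node at index 2, and component 1, byNODES gives 1 * 4 + 2 = 6 while byVDIM gives 2 * 3 + 1 = 7.

#include <cassert>

// Host-side sanity check (SERAC_HOST_DEVICE expands to nothing without CUDA).
void check_vdof_orderings()
{
  DoF node{2};  // index 2, positive sign, default orientation
  DoF by_nodes = ElementRestriction::GetVDofDevice(node, mfem::Ordering::byNODES,
                                                   /*component=*/1, /*num_nodes_=*/4, /*components_=*/3);
  DoF by_vdim = ElementRestriction::GetVDofDevice(node, mfem::Ordering::byVDIM,
                                                  /*component=*/1, /*num_nodes_=*/4, /*components_=*/3);
  assert(by_nodes.index() == 6);  // component * num_nodes_ + index
  assert(by_vdim.index() == 7);   // index * components_ + component
}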
99 changes: 83 additions & 16 deletions src/serac/numerics/functional/element_restriction.hpp
@@ -5,6 +5,7 @@
#include "mfem.hpp"
#include "axom/core.hpp"
#include "geometry.hpp"
#include "serac/infrastructure/accelerator.hpp"

inline bool isH1(const mfem::FiniteElementSpace& fes)
{
@@ -55,29 +56,29 @@ struct DoF {
uint64_t bits;

/// default ctor
DoF() : bits{} {}
SERAC_HOST_DEVICE DoF() : bits{} {}

/// copy ctor
DoF(const DoF& other) : bits{other.bits} {}
SERAC_HOST_DEVICE DoF(const DoF& other) : bits{other.bits} {}

/// create a `DoF` from the given index, sign and orientation values
DoF(uint64_t index, uint64_t sign = 0, uint64_t orientation = 0)
SERAC_HOST_DEVICE DoF(uint64_t index, uint64_t sign = 0, uint64_t orientation = 0)
: bits(((sign & 0x1ULL) << sign_shift) + ((orientation & 0x7ULL) << orientation_shift) + index)
{
}

/// copy assignment operator
void operator=(const DoF& other) { bits = other.bits; }
SERAC_HOST_DEVICE void operator=(const DoF& other) { bits = other.bits; }

/// get the sign field of this `DoF`
int sign() const { return (bits & sign_mask) ? -1 : 1; }
SERAC_HOST_DEVICE int sign() const { return (bits & sign_mask) ? -1 : 1; }

/// get the orientation field of this `DoF`
uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); }
SERAC_HOST_DEVICE uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); }

/// get the index field of this `DoF`

uint64_t index() const { return (bits & index_mask); }
SERAC_HOST_DEVICE uint64_t index() const { return (bits & index_mask); }
};
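The newly SERAC_HOST_DEVICE-qualified accessors decode the layout the constructor packs into the single uint64_t: the low bits hold the index, a three-bit field holds the orientation, and one high bit holds the sign (the exact shifts and masks are defined just above this hunk). A hedged round-trip with invented values, namespace qualification omitted:

#include <cassert>

// Round-trip illustrative values through the packed representation.
void dof_roundtrip()
{
  DoF d{/*index=*/42, /*sign=*/1, /*orientation=*/5};
  assert(d.index() == 42);
  assert(d.sign() == -1);        // stored sign bit set => negative
  assert(d.orientation() == 5);  // three-bit orientation field
}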

/// a small struct used to enable range-based for loops in `Array2D`
@@ -181,11 +182,68 @@ struct ElementRestriction {

DoF GetVDof(DoF node, uint64_t component) const;

SERAC_HOST_DEVICE static DoF GetVDofDevice(DoF node, mfem::Ordering::Type ordering_, uint64_t component,
uint64_t num_nodes_, uint64_t components_);

/// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector"
void Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const;
template <ExecutionSpace exec = ExecutionSpace::CPU>
void Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const
{
auto l_ptr = L_vector.Read();
auto e_ptr = E_vector.ReadWrite();

uint64_t d_components = components;
uint64_t d_nodes_per_elem = nodes_per_elem;
uint64_t d_num_nodes = num_nodes;
auto d_ordering = ordering;
constexpr axom::MemorySpace mem_space = detail::execution_to_memory<exec>::value;
axom::Array<DoF, 2, mem_space> tmp = dof_info;
axom::ArrayView<DoF, 2, mem_space> d_dof_info = tmp;

RAJA::forall<typename EvaluationSpacePolicy<exec>::forall_policy>(
RAJA::TypedRangeSegment<uint64_t>(0, num_elements), [d_components, d_nodes_per_elem, d_num_nodes, d_ordering,
e_ptr, l_ptr, d_dof_info] SERAC_HOST_DEVICE(uint64_t i) {
for (uint64_t c = 0; c < d_components; c++) {
for (uint64_t j = 0; j < d_nodes_per_elem; j++) {
uint64_t E_id = (i * d_components + c) * d_nodes_per_elem + j;
uint64_t L_id = GetVDofDevice(d_dof_info(i, j), d_ordering, c, d_num_nodes, d_components).index();
e_ptr[int(E_id)] = l_ptr[int(L_id)];
}
}
});
}

/// "E->L" in mfem parlance, each element scatter-adds its local vector into the appropriate place in the "L-vector"
void ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const;
template <ExecutionSpace exec>
void ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const
{
auto l_ptr = L_vector.ReadWrite(true);
auto e_ptr = E_vector.Read(true);

uint64_t d_components = components;
uint64_t d_nodes_per_elem = nodes_per_elem;
uint64_t d_num_nodes = num_nodes;
uint64_t d_num_elements = num_elements;
auto d_ordering = ordering;
constexpr axom::MemorySpace mem_space = detail::execution_to_memory<exec>::value;
axom::Array<DoF, 2, mem_space> tmp = dof_info;
axom::ArrayView<DoF, 2, mem_space> d_dof_info = tmp;

RAJA::forall<typename EvaluationSpacePolicy<exec>::forall_policy>(
RAJA::TypedRangeSegment<uint64_t>(0, 1), [d_components, d_nodes_per_elem, d_num_elements, d_num_nodes,
d_ordering, e_ptr, l_ptr, d_dof_info] SERAC_HOST_DEVICE(uint64_t) {
for (uint64_t i = 0; i < d_num_elements; ++i) {
for (uint64_t c = 0; c < d_components; c++) {
for (uint64_t j = 0; j < d_nodes_per_elem; j++) {
uint64_t E_id = (i * d_components + c) * d_nodes_per_elem + j;
uint64_t L_id = GetVDofDevice(d_dof_info(i, j), d_ordering, c, d_num_nodes, d_components).index();
l_ptr[int(L_id)] += e_ptr[int(E_id)];
}
}
}
});
}

/// the size of the "E-vector"
uint64_t esize;
@@ -205,12 +263,8 @@
/// the number of nodes in each element
uint64_t nodes_per_elem;

/// a 2D array (num_elements-by-nodes_per_elem) holding the dof info extracted from the finite element space
#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
axom::Array<DoF, 2, axom::MemorySpace::Device> dof_info;
#else
/// a 2D array (num_elements-by-nodes_per_elem) holding the dof info extracted from the finite element space
axom::Array<DoF, 2, axom::MemorySpace::Host> dof_info;
#endif

/// whether the underlying dofs are arranged "byNodes" or "byVDim"
mfem::Ordering::Type ordering;
Expand All @@ -221,6 +275,7 @@ struct ElementRestriction {
* Instead of doing the "E->L" (gather) and "L->E" (scatter) operations for only one element geometry, this
* class does them with block "E-vectors", where each element geometry is a separate block.
*/
// template<ExecutionSpace exec = ExecutionSpace::CPU>
struct BlockElementRestriction {
/// default ctor leaves this object uninitialized
BlockElementRestriction() {}
@@ -241,10 +296,22 @@ mfem::Array<int> bOffsets() const
mfem::Array<int> bOffsets() const;

/// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector"
void Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const;
template <ExecutionSpace exec>
void Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const
{
for (auto [geom, restriction] : restrictions) {
restriction.Gather<exec>(L_vector, E_block_vector.GetBlock(geom));
}
}

/// "E->L" in mfem parlance, each element scatter-adds its local vector into the appropriate place in the "L-vector"
void ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const;
template <ExecutionSpace exec>
void ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const
{
for (auto [geom, restriction] : restrictions) {
restriction.ScatterAdd<exec>(E_block_vector.GetBlock(geom), L_vector);
}
}

/// the individual ElementRestriction operators for each element geometry
std::map<mfem::Geometry::Type, ElementRestriction> restrictions;
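With Gather and ScatterAdd templated on ExecutionSpace, call sites now pick the space explicitly (note that the new ScatterAdd runs its loop nest inside a single RAJA iterate, presumably to serialize the racing updates to shared L-vector entries). A hedged call-site sketch; the restriction and vectors are assumed to be set up elsewhere.

// Illustrative only: choose the instantiation to match the build configuration.
void apply_restriction(BlockElementRestriction& R, const mfem::Vector& L,
                       mfem::BlockVector& E)
{
#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
  R.Gather<ExecutionSpace::GPU>(L, E);
#else
  R.Gather<ExecutionSpace::CPU>(L, E);
#endif
}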
[diffs for the remaining 8 changed files not loaded]
