Plumb ExecutionSpace template parameter through more classes
johnbowen42 committed Sep 12, 2024
1 parent 364b46a commit fa2cc1b
Showing 13 changed files with 402 additions and 370 deletions.
33 changes: 25 additions & 8 deletions src/serac/infrastructure/accelerator.hpp
@@ -12,7 +12,6 @@
*/

#pragma once
#include "RAJA/RAJA.hpp"
#if defined(__CUDACC__)
#define SERAC_HOST_DEVICE __host__ __device__
#define SERAC_HOST __host__
@@ -49,7 +48,7 @@
*/
#define SERAC_SUPPRESS_NVCC_HOSTDEVICE_WARNING
#endif

#include "RAJA/RAJA.hpp"
#include <memory>

#include "axom/core.hpp"
@@ -72,24 +71,44 @@ enum class ExecutionSpace
Dynamic // Corresponds to execution that can "legally" happen on either the host or device
};

#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
template <ExecutionSpace exec>
struct EvaluationSpacePolicy;

template <>
struct EvaluationSpacePolicy<ExecutionSpace::CPU> {
/// @brief Loop policy for the thread-x dimension (sequential on the CPU).
using threads_x = RAJA::LoopPolicy<RAJA::seq_exec>;
/// @brief Loop policy for the team/block dimension (sequential on the CPU).
using teams_e = RAJA::LoopPolicy<RAJA::seq_exec>;
/// @brief Kernel launch policy (sequential launch).
using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
/// @brief Policy for RAJA::forall loops.
using forall_policy = RAJA::seq_exec;
};

#if defined(__CUDACC__)
template <>
struct EvaluationSpacePolicy<ExecutionSpace::GPU> {
/// @brief Loop policy for the thread-x dimension (one CUDA thread per iterate).
using threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
/// @brief Loop policy for the team/block dimension (one CUDA block per iterate).
using teams_e = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
/// @brief Kernel launch policy (synchronous CUDA launch).
using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;
/// @brief Policy for RAJA::forall loops (128-thread blocks).
using forall_policy = RAJA::cuda_exec<128>;
};
#endif
// TODO(cuda): Delete these serac namespace scope type definitions in favor
// of the above user-configurable execution policies.
#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
/// @brief Alias for parallel threads policy on GPU
using threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
using teams_e = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;
using forall_policy = RAJA::cuda_exec<128>;

#else

/// @brief Alias for parallel threads policy on GPU.
using threads_x = RAJA::LoopPolicy<RAJA::seq_exec>;
/// @brief Alias for number of teams for GPU kernel launches.
using teams_e = RAJA::LoopPolicy<RAJA::seq_exec>;
/// @brief Alias for GPU kernel launch policy.
using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using forall_policy = RAJA::seq_exec;

#endif
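As a sketch of how the new trait is meant to be consumed, a kernel can be templated on ExecutionSpace and pull its RAJA policies from EvaluationSpacePolicy at compile time. The axpy kernel below is hypothetical, not part of this commit; for the GPU instantiation the pointers must be device-accessible.

// Hypothetical caller: the trait selects seq_exec or cuda_exec<128> per space.
template <ExecutionSpace exec>
void axpy(double* y, const double* x, double a, int n)
{
  using policy = typename EvaluationSpacePolicy<exec>::forall_policy;
  RAJA::forall<policy>(RAJA::TypedRangeSegment<int>(0, n),
                       [=] SERAC_HOST_DEVICE(int i) { y[i] += a * x[i]; });
}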

/**
@@ -115,7 +134,6 @@ SERAC_HOST_DEVICE void suppress_capture_warnings(T...)
{
}

#ifdef SERAC_USE_UMPIRE
/// @overload
template <>
struct execution_to_memory<ExecutionSpace::CPU> {
@@ -133,7 +151,6 @@ template <>
struct execution_to_memory<ExecutionSpace::Dynamic> {
static constexpr axom::MemorySpace value = axom::MemorySpace::Unified;
};
#endif

/// @brief Helper template for @p execution_to_memory trait
template <ExecutionSpace space>
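With the SERAC_USE_UMPIRE guard gone, the execution_to_memory trait is always available and maps each execution space to the axom memory space its data should live in (CPU → Host, GPU → Device, Dynamic → Unified). A minimal sketch of the kind of helper this enables; make_array is hypothetical, and the detail:: qualification assumes the trait's namespace as the later Gather code suggests.

// Hypothetical helper: build a 2D array in the memory space matching `exec`.
template <ExecutionSpace exec, typename T>
axom::Array<T, 2, detail::execution_to_memory<exec>::value> make_array(int rows, int cols)
{
  return axom::Array<T, 2, detail::execution_to_memory<exec>::value>(rows, cols);
}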
10 changes: 5 additions & 5 deletions src/serac/numerics/functional/CMakeLists.txt
@@ -33,10 +33,10 @@ set(functional_headers
tuple_tensor_dual_functions.hpp
)

set(functional_sources
domain.cpp
element_restriction.cpp
geometric_factors.cpp
set(functional_sources
domain.cpp
element_restriction.cpp
geometric_factors.cpp
quadrature_data.cpp)

set(functional_detail_headers
@@ -66,7 +66,7 @@ blt_list_append(TO functional_headers ELEMENTS ${functional_cuda_headers} IF ENA

blt_add_library(
NAME serac_functional
HEADERS ${functional_headers} ${functional_detail_headers}
HEADERS ${functional_headers} ${functional_detail_headers}
SOURCES ${functional_sources}
DEPENDS_ON ${functional_depends}
)
10 changes: 2 additions & 8 deletions src/serac/numerics/functional/domain_integral_kernels.hpp
@@ -235,14 +235,6 @@ void evaluation_kernel_impl(trial_element_tuple_type trial_elements, test_elemen
// variadic non-type parameter.
[&ctx, t, J, x, u, trial_elements, qf, qpts_per_elem, rule, r, qf_state, elements, qf_derivatives,
qf_inputs, qf_outputs, interpolate_result, update_state](uint32_t e) {
// (void)qf_state;
// (void)qf_derivatives;
// (void)update_state;
// (void)qpts_per_elem;
// (void)trial_elements;
// (void)qf_inputs;
// (void)interpolate_result;
// (void)u;
detail::suppress_capture_warnings(qf_state, qf_derivatives, update_state, qpts_per_elem, trial_elements,
qf_inputs, interpolate_result, u);

@@ -478,6 +470,8 @@ void element_gradient_kernel(ExecArrayView<double, 3, ExecutionSpace::CPU> dK,

RAJA::RangeSegment x_range(0, nquad);
RAJA::loop<threads_x>(ctx, x_range, [&derivatives, qf_derivatives, nquad, e](int q) {
detail::suppress_capture_warnings(nquad);
(void)nquad;
if constexpr (is_QOI_2) {
get<0>(derivatives[e](q)) = qf_derivatives[e * nquad + uint32_t(q)];
}
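The replacement for the commented-out (void) casts above is detail::suppress_capture_warnings, the variadic no-op declared in accelerator.hpp: passing each conditionally-used capture to it counts as a use, silencing nvcc's unused-capture warnings in lambdas where an if constexpr branch drops out. A self-contained illustration of the idiom (the surrounding function and names are invented for this sketch):

// A no-op whose only purpose is to "use" its arguments.
template <typename... T>
SERAC_HOST_DEVICE void suppress_capture_warnings(T...) {}

template <bool apply_scale>
void scale_or_copy(double* out, const double* in, double scale, int n)
{
  auto body = [=](int i) {
    // `scale` is read only when apply_scale is true; naming it here keeps
    // the compiler from warning about an unused capture otherwise.
    suppress_capture_warnings(scale);
    if constexpr (apply_scale) {
      out[i] = scale * in[i];
    } else {
      out[i] = in[i];
    }
  };
  for (int i = 0; i < n; i++) body(i);
}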
76 changes: 11 additions & 65 deletions src/serac/numerics/functional/element_restriction.cpp
@@ -221,12 +221,8 @@ std::vector<Array2D<int>> geom_local_face_dofs(int p)
return output;
}

#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
axom::Array<DoF, 2, axom::MemorySpace::Device> GetElementRestriction(const mfem::FiniteElementSpace* fes,
#else
axom::Array<DoF, 2, axom::MemorySpace::Host> GetElementRestriction(const mfem::FiniteElementSpace* fes,
#endif
mfem::Geometry::Type geom)
mfem::Geometry::Type geom)
{
std::vector<DoF> elem_dofs{};
mfem::Mesh* mesh = fes->GetMesh();
@@ -434,6 +430,16 @@ uint64_t ElementRestriction::ESize() const { return esize; }

uint64_t ElementRestriction::LSize() const { return lsize; }

SERAC_HOST_DEVICE DoF ElementRestriction::GetVDofDevice(DoF node, mfem::Ordering::Type ordering_, uint64_t component,
uint64_t num_nodes_, uint64_t components_)
{
if (ordering_ == mfem::Ordering::Type::byNODES) {
return DoF{component * num_nodes_ + node.index(), (node.sign() == 1) ? 0ull : 1ull, node.orientation()};
} else {
return DoF{node.index() * components_ + component, (node.sign() == 1) ? 0ull : 1ull, node.orientation()};
}
}

DoF ElementRestriction::GetVDof(DoF node, uint64_t component) const
{
if (ordering == mfem::Ordering::Type::byNODES) {
@@ -461,52 +467,6 @@ void ElementRestriction::GetElementVDofs(int i, DoF* vdofs) const
}
}

void ElementRestriction::Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const
{
for (uint64_t i = 0; i < num_elements; i++) {
for (uint64_t c = 0; c < components; c++) {
for (uint64_t j = 0; j < nodes_per_elem; j++) {
uint64_t E_id = (i * components + c) * nodes_per_elem + j;
uint64_t L_id = GetVDof(dof_info(i, j), c).index();
auto l_ptr = L_vector.Read();
auto e_ptr = E_vector.ReadWrite();
// std::cout << "l vec space " << (int)(L_vector.GetMemory().GetMemoryType()) << std::endl;
// std::cout << "e vec space " << (int)(E_vector.GetMemory().GetMemoryType()) << std::endl;
// E_vector[int(E_id)] = L_vector[int(L_id)];
if (L_vector.GetMemory().GetMemoryType() != mfem::MemoryType::HOST) {
RAJA::forall<RAJA::cuda_exec<128>>(RAJA::RangeSegment(0, 1),
[=] SERAC_HOST_DEVICE(int idx) { e_ptr[int(E_id)] = l_ptr[int(L_id)]; });
} else {
E_vector[int(E_id)] = L_vector[int(L_id)];
}
}
}
}
}

void ElementRestriction::ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const
{
for (uint64_t i = 0; i < num_elements; i++) {
for (uint64_t c = 0; c < components; c++) {
for (uint64_t j = 0; j < nodes_per_elem; j++) {
uint64_t E_id = (i * components + c) * nodes_per_elem + j;
uint64_t L_id = GetVDof(dof_info(i, j), c).index();
// std::cout << E_vector[int(E_id)] << std::endl;
auto l_ptr = L_vector.ReadWrite();
auto e_ptr = E_vector.Read();
// std::cout << "l vec space " << (int)(L_vector.GetMemory().GetMemoryType()) << std::endl;
// std::cout << "e vec space " << (int)(E_vector.GetMemory().GetMemoryType()) << std::endl;
if (L_vector.GetMemory().GetMemoryType() != mfem::MemoryType::HOST) {
RAJA::forall<RAJA::cuda_exec<128>>(RAJA::RangeSegment(0, 1),
[=] SERAC_HOST_DEVICE(int idx) { l_ptr[int(L_id)] += e_ptr[int(E_id)]; });
} else {
L_vector[int(L_id)] += E_vector[int(E_id)];
}
}
}
}
}

////////////////////////////////////////////////////////////////////////

BlockElementRestriction::BlockElementRestriction(const mfem::FiniteElementSpace* fes)
@@ -562,18 +522,4 @@ mfem::Array<int> BlockElementRestriction::bOffsets() const
return offsets;
};

void BlockElementRestriction::Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const
{
for (auto [geom, restriction] : restrictions) {
restriction.Gather(L_vector, E_block_vector.GetBlock(geom));
}
}

void BlockElementRestriction::ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const
{
for (auto [geom, restriction] : restrictions) {
restriction.ScatterAdd(E_block_vector.GetBlock(geom), L_vector);
}
}

} // namespace serac
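A quick worked check of the two orderings GetVDofDevice handles, with namespace qualification omitted and values invented: for num_nodes_ = 4, components_ = 3, a node at index 2, and component 1, byNODES gives 1 * 4 + 2 = 6 while byVDIM gives 2 * 3 + 1 = 7.

#include <cassert>

// Host-side sanity check (SERAC_HOST_DEVICE expands to nothing without CUDA).
void check_vdof_orderings()
{
  DoF node{2};  // index 2, positive sign, default orientation
  DoF by_nodes = ElementRestriction::GetVDofDevice(node, mfem::Ordering::byNODES,
                                                   /*component=*/1, /*num_nodes_=*/4, /*components_=*/3);
  DoF by_vdim = ElementRestriction::GetVDofDevice(node, mfem::Ordering::byVDIM,
                                                  /*component=*/1, /*num_nodes_=*/4, /*components_=*/3);
  assert(by_nodes.index() == 6);  // component * num_nodes_ + index
  assert(by_vdim.index() == 7);   // index * components_ + component
}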
99 changes: 83 additions & 16 deletions src/serac/numerics/functional/element_restriction.hpp
@@ -5,6 +5,7 @@
#include "mfem.hpp"
#include "axom/core.hpp"
#include "geometry.hpp"
#include "serac/infrastructure/accelerator.hpp"

inline bool isH1(const mfem::FiniteElementSpace& fes)
{
@@ -55,29 +56,29 @@ struct DoF {
uint64_t bits;

/// default ctor
DoF() : bits{} {}
SERAC_HOST_DEVICE DoF() : bits{} {}

/// copy ctor
DoF(const DoF& other) : bits{other.bits} {}
SERAC_HOST_DEVICE DoF(const DoF& other) : bits{other.bits} {}

/// create a `DoF` from the given index, sign and orientation values
DoF(uint64_t index, uint64_t sign = 0, uint64_t orientation = 0)
SERAC_HOST_DEVICE DoF(uint64_t index, uint64_t sign = 0, uint64_t orientation = 0)
: bits(((sign & 0x1ULL) << sign_shift) + ((orientation & 0x7ULL) << orientation_shift) + index)
{
}

/// copy assignment operator
void operator=(const DoF& other) { bits = other.bits; }
SERAC_HOST_DEVICE void operator=(const DoF& other) { bits = other.bits; }

/// get the sign field of this `DoF`
int sign() const { return (bits & sign_mask) ? -1 : 1; }
SERAC_HOST_DEVICE int sign() const { return (bits & sign_mask) ? -1 : 1; }

/// get the orientation field of this `DoF`
uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); }
SERAC_HOST_DEVICE uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); }

/// get the index field of this `DoF`

uint64_t index() const { return (bits & index_mask); }
SERAC_HOST_DEVICE uint64_t index() const { return (bits & index_mask); }
};
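The newly SERAC_HOST_DEVICE-qualified accessors decode the layout the constructor packs into the single uint64_t: the low bits hold the index, a three-bit field holds the orientation, and one high bit holds the sign (the exact shifts and masks are defined just above this hunk). A hedged round-trip with invented values, namespace qualification omitted:

#include <cassert>

// Round-trip illustrative values through the packed representation.
void dof_roundtrip()
{
  DoF d{/*index=*/42, /*sign=*/1, /*orientation=*/5};
  assert(d.index() == 42);
  assert(d.sign() == -1);        // stored sign bit set => negative
  assert(d.orientation() == 5);  // three-bit orientation field
}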

/// a small struct used to enable range-based for loops in `Array2D`
@@ -181,11 +182,68 @@ struct ElementRestriction {

DoF GetVDof(DoF node, uint64_t component) const;

SERAC_HOST_DEVICE static DoF GetVDofDevice(DoF node, mfem::Ordering::Type ordering_, uint64_t component,
uint64_t num_nodes_, uint64_t components_);

/// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector"
void Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const;
template <ExecutionSpace exec = ExecutionSpace::CPU>
void Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const
{
auto l_ptr = L_vector.Read();
auto e_ptr = E_vector.ReadWrite();

uint64_t d_components = components;
uint64_t d_nodes_per_elem = nodes_per_elem;
uint64_t d_num_nodes = num_nodes;
auto d_ordering = ordering;
constexpr axom::MemorySpace mem_space = detail::execution_to_memory<exec>::value;
axom::Array<DoF, 2, mem_space> tmp = dof_info;
axom::ArrayView<DoF, 2, mem_space> d_dof_info = tmp;

RAJA::forall<typename EvaluationSpacePolicy<exec>::forall_policy>(
RAJA::TypedRangeSegment<uint64_t>(0, num_elements), [d_components, d_nodes_per_elem, d_num_nodes, d_ordering,
e_ptr, l_ptr, d_dof_info] SERAC_HOST_DEVICE(uint64_t i) {
for (uint64_t c = 0; c < d_components; c++) {
for (uint64_t j = 0; j < d_nodes_per_elem; j++) {
uint64_t E_id = (i * d_components + c) * d_nodes_per_elem + j;
uint64_t L_id = GetVDofDevice(d_dof_info(i, j), d_ordering, c, d_num_nodes, d_components).index();
e_ptr[int(E_id)] = l_ptr[int(L_id)];
}
}
});
}

/// "E->L" in mfem parlance, each element scatter-adds its local vector into the appropriate place in the "L-vector"
void ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const;
template <ExecutionSpace exec>
void ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const
{
auto l_ptr = L_vector.ReadWrite(true);
auto e_ptr = E_vector.Read(true);

uint64_t d_components = components;
uint64_t d_nodes_per_elem = nodes_per_elem;
uint64_t d_num_nodes = num_nodes;
uint64_t d_num_elements = num_elements;
auto d_ordering = ordering;
constexpr axom::MemorySpace mem_space = detail::execution_to_memory<exec>::value;
axom::Array<DoF, 2, mem_space> tmp = dof_info;
axom::ArrayView<DoF, 2, mem_space> d_dof_info = tmp;

RAJA::forall<typename EvaluationSpacePolicy<exec>::forall_policy>(
RAJA::TypedRangeSegment<uint64_t>(0, 1), [d_components, d_nodes_per_elem, d_num_elements, d_num_nodes,
d_ordering, e_ptr, l_ptr, d_dof_info] SERAC_HOST_DEVICE(uint64_t) {
for (uint64_t i = 0; i < d_num_elements; ++i) {
for (uint64_t c = 0; c < d_components; c++) {
for (uint64_t j = 0; j < d_nodes_per_elem; j++) {
uint64_t E_id = (i * d_components + c) * d_nodes_per_elem + j;
uint64_t L_id = GetVDofDevice(d_dof_info(i, j), d_ordering, c, d_num_nodes, d_components).index();
l_ptr[int(L_id)] += e_ptr[int(E_id)];
}
}
}
});
}

/// the size of the "E-vector"
uint64_t esize;
@@ -205,12 +263,8 @@
/// the number of nodes in each element
uint64_t nodes_per_elem;

/// a 2D array (num_elements-by-nodes_per_elem) holding the dof info extracted from the finite element space
#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
axom::Array<DoF, 2, axom::MemorySpace::Device> dof_info;
#else
/// a 2D array (num_elements-by-nodes_per_elem) holding the dof info extracted from the finite element space
axom::Array<DoF, 2, axom::MemorySpace::Host> dof_info;
#endif

/// whether the underlying dofs are arranged "byNodes" or "byVDim"
mfem::Ordering::Type ordering;
Expand All @@ -221,6 +275,7 @@ struct ElementRestriction {
* Instead of doing the "E->L" (gather) and "L->E" (scatter) operations for only one element geometry, this
* class does them with block "E-vectors", where each element geometry is a separate block.
*/
// template<ExecutionSpace exec = ExecutionSpace::CPU>
struct BlockElementRestriction {
/// default ctor leaves this object uninitialized
BlockElementRestriction() {}
@@ -241,10 +296,22 @@ mfem::Array<int> bOffsets() const
mfem::Array<int> bOffsets() const;

/// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector"
void Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const;
template <ExecutionSpace exec>
void Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const
{
for (auto [geom, restriction] : restrictions) {
restriction.Gather<exec>(L_vector, E_block_vector.GetBlock(geom));
}
}

/// "E->L" in mfem parlance, each element scatter-adds its local vector into the appropriate place in the "L-vector"
void ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const;
template <ExecutionSpace exec>
void ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const
{
for (auto [geom, restriction] : restrictions) {
restriction.ScatterAdd<exec>(E_block_vector.GetBlock(geom), L_vector);
}
}

/// the individual ElementRestriction operators for each element geometry
std::map<mfem::Geometry::Type, ElementRestriction> restrictions;
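With Gather and ScatterAdd templated on ExecutionSpace, call sites now pick the space explicitly (note that the new ScatterAdd runs its loop nest inside a single RAJA iterate, presumably to serialize the racing updates to shared L-vector entries). A hedged call-site sketch; the restriction and vectors are assumed to be set up elsewhere.

// Illustrative only: choose the instantiation to match the build configuration.
void apply_restriction(BlockElementRestriction& R, const mfem::Vector& L,
                       mfem::BlockVector& E)
{
#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION
  R.Gather<ExecutionSpace::GPU>(L, E);
#else
  R.Gather<ExecutionSpace::CPU>(L, E);
#endif
}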
[diffs for the remaining 8 changed files not loaded]
