
enable gpu-aware MPI by default #121

Merged
14 commits merged on Oct 1, 2022
10 changes: 8 additions & 2 deletions README.md
@@ -84,10 +84,16 @@ to the CMake command-line options (or change the `QUOKKA_PYTHON` option to `OFF`
## Running on GPUs
By default, Quokka compiles itself to run only on CPUs. (If you want to run on NVIDIA GPUs, re-build Quokka as shown below. **(Warning: CUDA 11.6 generates invalid device code; see issue [21](https://github.com/BenWibking/quokka/issues/21). Use CUDA <= 11.5 instead.)**
```
cmake .. -DCMAKE_BUILD_TYPE=Release -DAMReX_GPU_BACKEND=CUDA
cmake .. -DCMAKE_BUILD_TYPE=Release -DAMReX_GPU_BACKEND=CUDA -DAMREX_GPUS_PER_NODE=N
make -j6
```
The compiled test problems are in the test problem subdirectories in `build/src/`. Example scripts for running Quokka on compute clusters are in the `scripts/` subdirectory. Please note that you must configure your compute cluster to run with 1 MPI rank per GPU in order for Quokka to work correctly. Quokka is only supported on Volta-class (V100) GPUs or newer.
where $N$ is the number of GPUs available per compute node.

It is necessary to use `-DAMREX_GPUS_PER_NODE` to specify the number of GPUs per compute node. Without this, performance will be very poor. All GPUs on a node must be visible from each MPI rank on the node for efficient GPU-aware MPI communication to take place via CUDA IPC. (When using the SLURM job scheduler, this means that `--gpu-bind` should be set to `none`.)
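For example, a minimal Slurm batch script for a node with 4 GPUs might look like the following sketch (adapted from the example scripts in `scripts/`; partition, account, and module names are cluster-specific and omitted here):
```
#!/bin/bash --login
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4   # one MPI rank per GPU
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=none       # keep all GPUs visible to every rank so CUDA IPC can be used

# enable CUDA IPC for on-node GPU-to-GPU transfers in OpenMPI
export OMPI_MCA_btl_smcuda_use_cuda_ipc=1

srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in
```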

The compiled test problems are in the test problem subdirectories in `build/src/`. Example scripts for running Quokka on compute clusters are in the `scripts/` subdirectory.

Note that Quokka is only supported on Volta-class (V100) GPUs or newer.

Note that 1D problems can run very slowly on GPUs due to a lack of sufficient parallelism. To run the test suite in a reasonable amount of time, you may wish to exclude the matter-energy exchange tests, e.g.:
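The command itself is truncated in this diff view. A minimal sketch, assuming the matter-energy exchange tests share a `MatterEnergyExchange` name prefix (a hypothetical pattern; adjust to the actual test names), is:
```
cd build
# -E excludes all tests whose names match the given regular expression
ctest -E "MatterEnergyExchange*"
```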
1 change: 1 addition & 0 deletions cmake/delta_gcc_cuda.cmake
@@ -5,6 +5,7 @@ set(CMAKE_C_COMPILER "gcc" CACHE PATH "")
set(CMAKE_CXX_COMPILER "g++" CACHE PATH "")
set(CMAKE_CUDA_COMPILER "nvcc" CACHE PATH "")
set(AMReX_GPU_BACKEND CUDA CACHE STRING "")
set(AMREX_GPUS_PER_NODE 4 CACHE STRING "")
set(CMAKE_CUDA_ARCHITECTURES 80 CACHE STRING "")
set(AMReX_ASCENT OFF CACHE BOOL "" FORCE)
set(AMReX_CONDUIT OFF CACHE BOOL "" FORCE)
21 changes: 15 additions & 6 deletions scripts/delta-1gpu.submit → scripts/blast_1gpu.submit
@@ -1,19 +1,28 @@
#!/bin/bash --login
#SBATCH --job-name="quokka_rt3d_test"
#SBATCH --job-name="hydro3dblast"
#SBATCH --partition=gpuA100x4
#SBATCH --account=cvz-delta-gpu
#SBATCH --mem=220G
#SBATCH --time=00:30:00
#SBATCH --mem=0
#SBATCH --exclusive
#SBATCH --time=0:10:00
#SBATCH --constraint="scratch"
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=closest,1
#SBATCH --exclusive
#SBATCH --gpu-bind=none

export OMPI_MCA_btl_smcuda_use_cuda_ipc=1 # enable cuda_ipc
export OMPI_MCA_btl_smcuda_cuda_ipc_verbose=100 # debugging

module purge
module load gcc/11.2.0
module load cuda/11.7.0
module load openmpi/4.1.4

# run
#srun ./build/src/RayleighTaylor3D/test_hydro3d_rt tests/RT3D.in
srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in




28 changes: 28 additions & 0 deletions scripts/blast_1node.submit
@@ -0,0 +1,28 @@
#!/bin/bash --login
#SBATCH --job-name="hydro3dblast"
#SBATCH --partition=gpuA100x4
#SBATCH --account=cvz-delta-gpu
#SBATCH --mem=0
#SBATCH --exclusive
#SBATCH --time=0:30:00
#SBATCH --constraint="scratch"
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=16
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=none

export OMPI_MCA_btl_smcuda_use_cuda_ipc=1 # enable cuda_ipc
export OMPI_MCA_btl_smcuda_cuda_ipc_verbose=100 # debugging

module purge
module load gcc/11.2.0
module load cuda/11.7.0
module load openmpi/4.1.4

# run
srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in




23 changes: 0 additions & 23 deletions scripts/delta-1node.submit

This file was deleted.

3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_amr_shell.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in
3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_amr_shell_stream.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1 amrex.max_gpu_streams=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.max_gpu_streams=1
4 changes: 1 addition & 3 deletions scripts/gpu_wrapper_cooling_1024.sh
@@ -1,6 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/Cooling/test_cooling tests/Cooling_1024.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1

./build/src/Cooling/test_cooling tests/Cooling_1024.in

1 change: 0 additions & 1 deletion scripts/gpu_wrapper_sedov_1024.sh
@@ -1,5 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_1024.in
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_1024.in
1 change: 0 additions & 1 deletion scripts/gpu_wrapper_sedov_2048.sh
@@ -1,5 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_2048.in
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_2048.in
1 change: 0 additions & 1 deletion scripts/gpu_wrapper_sedov_256.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in
6 changes: 0 additions & 6 deletions scripts/gpu_wrapper_sedov_256_gpuaware.sh

This file was deleted.

5 changes: 2 additions & 3 deletions scripts/gpu_wrapper_sedov_512.sh
@@ -1,6 +1,5 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_512.in amrex.async_out=1
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_512.in amrex.async_out=1 amrex.use_gpu_aware_mpi=0
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_512.in


3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_shell_1024.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_1024.in amrex.async_out=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_1024.in
4 changes: 2 additions & 2 deletions scripts/gpu_wrapper_shell_2048.sh
@@ -1,4 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_2048.in amrex.async_out=0
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_2048.in

3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_shell_256.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_256.in amrex.async_out=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_256.in
3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_shell_512.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_512.in amrex.async_out=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_512.in
4 changes: 2 additions & 2 deletions src/Advection/test_advection.cpp
@@ -92,8 +92,8 @@ void AdvectionSimulation<SawtoothProblem>::computeReferenceSolution(
std::vector<double> d(nx);
std::vector<double> d_exact(nx);
for (int i = 0; i < nx; ++i) {
amrex::Real rho = values.at(0).at(i);
amrex::Real rho_exact = val_exact.at(0).at(i);
amrex::Real rho = values.at(0)[i];
amrex::Real rho_exact = val_exact.at(0)[i];
d.at(i) = rho;
d_exact.at(i) = rho_exact;
}
4 changes: 2 additions & 2 deletions src/AdvectionSemiellipse/test_advection_semiellipse.cpp
@@ -86,8 +86,8 @@ void AdvectionSimulation<SemiellipseProblem>::computeReferenceSolution(
std::vector<double> d(nx);
std::vector<double> d_exact(nx);
for (int i = 0; i < nx; ++i) {
amrex::Real rho = values.at(0).at(i);
amrex::Real rho_exact = val_exact.at(0).at(i);
amrex::Real rho = values.at(0)[i];
amrex::Real rho_exact = val_exact.at(0)[i];
d.at(i) = rho;
d_exact.at(i) = rho_exact;
}
4 changes: 1 addition & 3 deletions src/HydroBlast2D/test_hydro2d_blast.hpp
@@ -19,9 +19,7 @@
// internal headers

#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"

// function definitions

4 changes: 1 addition & 3 deletions src/HydroBlast3D/test_hydro3d_blast.hpp
@@ -14,9 +14,7 @@

// internal headers
#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"

// function definitions
auto testproblem_hydro_sedov() -> int;
14 changes: 7 additions & 7 deletions src/HydroContact/test_hydro_contact.cpp
@@ -141,16 +141,16 @@ void RadhydroSimulation<ContactProblem>::computeReferenceSolution(
std::vector<double> P_exact(nx);

for (int i = 0; i < nx; ++i) {
amrex::Real const this_x = position.at(i);
amrex::Real const this_x = position[i];
x.push_back(this_x);

{
const auto rho =
val_exact.at(HydroSystem<ContactProblem>::density_index).at(i);
val_exact.at(HydroSystem<ContactProblem>::density_index)[i];
const auto xmom =
val_exact.at(HydroSystem<ContactProblem>::x1Momentum_index).at(i);
val_exact.at(HydroSystem<ContactProblem>::x1Momentum_index)[i];
const auto E =
val_exact.at(HydroSystem<ContactProblem>::energy_index).at(i);
val_exact.at(HydroSystem<ContactProblem>::energy_index)[i];
const auto vx = xmom / rho;
const auto Eint = E - 0.5 * rho * (vx * vx);
const auto P = (HydroSystem<ContactProblem>::gamma_ - 1.) * Eint;
@@ -161,11 +161,11 @@

{
const auto frho =
values.at(HydroSystem<ContactProblem>::density_index).at(i);
values.at(HydroSystem<ContactProblem>::density_index)[i];
const auto fxmom =
values.at(HydroSystem<ContactProblem>::x1Momentum_index).at(i);
values.at(HydroSystem<ContactProblem>::x1Momentum_index)[i];
const auto fE =
values.at(HydroSystem<ContactProblem>::energy_index).at(i);
values.at(HydroSystem<ContactProblem>::energy_index)[i];
const auto fvx = fxmom / frho;
const auto fEint = fE - 0.5 * frho * (fvx * fvx);
const auto fP = (HydroSystem<ContactProblem>::gamma_ - 1.) * fEint;
4 changes: 1 addition & 3 deletions src/HydroContact/test_hydro_contact.hpp
@@ -19,9 +19,7 @@
// internal headers

#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"

// function definitions

2 changes: 1 addition & 1 deletion src/HydroHighMach/CMakeLists.txt
@@ -1,4 +1,4 @@
add_executable(test_hydro_highmach test_hydro_highmach.cpp ../main.cpp ../fextract.cpp ../interpolate.c)
add_executable(test_hydro_highmach test_hydro_highmach.cpp ../main.cpp ../fextract.cpp ../interpolate.cpp)

if(AMReX_GPU_BACKEND MATCHES "CUDA")
setup_target_for_cuda_compilation(test_hydro_highmach)
35 changes: 24 additions & 11 deletions src/HydroHighMach/test_hydro_highmach.cpp
@@ -89,7 +89,7 @@ void RadhydroSimulation<HighMachProblem>::computeReferenceSolution(
std::vector<double> x;

for (int i = 0; i < nx; ++i) {
Real const this_x = position.at(i);
Real const this_x = position[i];
x.push_back(this_x);
}

@@ -124,38 +124,51 @@
}

// interpolate density onto mesh
std::vector<double> d_interp(x.size());
amrex::Gpu::HostVector<double> d_interp(x.size());
interpolate_arrays(x.data(), d_interp.data(), static_cast<int>(x.size()),
x_exact.data(), d_exact.data(),
static_cast<int>(x_exact.size()));

// interpolate velocity onto mesh
std::vector<double> vx_interp(x.size());
amrex::Gpu::HostVector<double> vx_interp(x.size());
interpolate_arrays(x.data(), vx_interp.data(), static_cast<int>(x.size()),
x_exact.data(), vx_exact.data(),
static_cast<int>(x_exact.size()));

// interpolate pressure onto mesh
std::vector<double> P_interp(x.size());
amrex::Gpu::HostVector<double> P_interp(x.size());
interpolate_arrays(x.data(), P_interp.data(), static_cast<int>(x.size()),
x_exact.data(), P_exact.data(),
static_cast<int>(x_exact.size()));

amrex::Gpu::DeviceVector<double> rho_g(d_interp.size());
amrex::Gpu::DeviceVector<double> vx_g(vx_interp.size());
amrex::Gpu::DeviceVector<double> P_g(P_interp.size());

// copy exact solution to device
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, d_interp.begin(), d_interp.end(), rho_g.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, vx_interp.begin(), vx_interp.end(), vx_g.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, P_interp.begin(), P_interp.end(), P_g.begin());
amrex::Gpu::streamSynchronizeAll();

// save reference solution
const Real gamma = HydroSystem<HighMachProblem>::gamma_;
for (amrex::MFIter iter(ref); iter.isValid(); ++iter) {
const amrex::Box &indexRange = iter.validbox(); // excludes ghost zones
auto const &state = ref.array(iter);
auto const ncomp = ref.nComp();
auto const &rho_arr = rho_g.data();
auto const &vx_arr = vx_g.data();
auto const &P_arr = P_g.data();

amrex::LoopConcurrentOnCpu(indexRange, [=](int i, int j, int k) noexcept {
amrex::ParallelFor(indexRange, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
for (int n = 0; n < ncomp; ++n) {
state(i, j, k, n) = 0.;
}

Real rho = d_interp.at(i);
Real vx = vx_interp.at(i);
Real Pgas = P_interp.at(i);
Real rho = rho_arr[i];
Real vx = vx_arr[i];
Real Pgas = P_arr[i];
Real Eint = Pgas / (gamma - 1.);
Real Etot = Eint + 0.5 * rho * (vx * vx);

Expand All @@ -176,11 +189,11 @@ void RadhydroSimulation<HighMachProblem>::computeReferenceSolution(

for (int i = 0; i < nx; ++i) {
const auto frho =
values.at(HydroSystem<HighMachProblem>::density_index).at(i);
values.at(HydroSystem<HighMachProblem>::density_index)[i];
const auto fxmom =
values.at(HydroSystem<HighMachProblem>::x1Momentum_index).at(i);
values.at(HydroSystem<HighMachProblem>::x1Momentum_index)[i];
const auto fE =
values.at(HydroSystem<HighMachProblem>::energy_index).at(i);
values.at(HydroSystem<HighMachProblem>::energy_index)[i];
const auto fvx = fxmom / frho;
const auto fEint = fE - 0.5 * frho * (fvx * fvx);
const auto fP = (HydroSystem<HighMachProblem>::gamma_ - 1.) * fEint;
5 changes: 2 additions & 3 deletions src/HydroHighMach/test_hydro_highmach.hpp
@@ -19,9 +19,8 @@
// internal headers

#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"


// function definitions

4 changes: 1 addition & 3 deletions src/HydroKelvinHelmholz/test_hydro2d_kh.hpp
@@ -18,9 +18,7 @@
// internal headers

#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"

// function definitions
