
enable gpu-aware MPI by default #121

Merged
14 commits merged on Oct 1, 2022
10 changes: 8 additions & 2 deletions README.md
@@ -84,10 +84,16 @@ to the CMake command-line options (or change the `QUOKKA_PYTHON` option to `OFF`
## Running on GPUs
By default, Quokka compiles itself to run only on CPUs. (If you want to run on NVIDIA GPUs, re-build Quokka as shown below. **(Warning: CUDA 11.6 generates invalid device code; see issue [21](https://github.com/BenWibking/quokka/issues/21). Use CUDA <= 11.5 instead.)**
```
cmake .. -DCMAKE_BUILD_TYPE=Release -DAMReX_GPU_BACKEND=CUDA
cmake .. -DCMAKE_BUILD_TYPE=Release -DAMReX_GPU_BACKEND=CUDA -DAMREX_GPUS_PER_NODE=N
make -j6
```
The compiled test problems are in the test problem subdirectories in `build/src/`. Example scripts for running Quokka on compute clusters are in the `scripts/` subdirectory. Please note that you must configure your compute cluster to run with 1 MPI rank per GPU in order for Quokka to work correctly. Quokka is only supported on Volta-class (V100) GPUs or newer.
where $N$ is the number of GPUs available per compute node.

It is necessary to use `-DAMREX_GPUS_PER_NODE` to specify the number of GPUs per compute node. Without this, performance will be very poor. All GPUs on a node must be visible from each MPI rank on the node for efficient GPU-aware MPI communication to take place via CUDA IPC. (When using the SLURM job scheduler, this means that `--gpu-bind` should be set to `none`.)
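For example, a minimal Slurm batch script for a node with 4 GPUs might look like the following sketch (adapted from the example scripts in `scripts/`; partition, account, and module names are cluster-specific and omitted here):
```
#!/bin/bash --login
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4   # one MPI rank per GPU
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=none       # keep all GPUs visible to every rank so CUDA IPC can be used

# enable CUDA IPC for on-node GPU-to-GPU transfers in OpenMPI
export OMPI_MCA_btl_smcuda_use_cuda_ipc=1

srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in
```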

The compiled test problems are in the test problem subdirectories in `build/src/`. Example scripts for running Quokka on compute clusters are in the `scripts/` subdirectory.

Note that Quokka is only supported on Volta-class (V100) GPUs or newer.

Note that 1D problems can run very slowly on GPUs due to a lack of sufficient parallelism. To run the test suite in a reasonable amount of time, you may wish to exclude the matter-energy exchange tests, e.g.:
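The command itself is truncated in this diff view. A minimal sketch, assuming the matter-energy exchange tests share a `MatterEnergyExchange` name prefix (a hypothetical pattern; adjust to the actual test names), is:
```
cd build
# -E excludes all tests whose names match the given regular expression
ctest -E "MatterEnergyExchange*"
```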
1 change: 1 addition & 0 deletions cmake/delta_gcc_cuda.cmake
@@ -5,6 +5,7 @@ set(CMAKE_C_COMPILER "gcc" CACHE PATH "")
set(CMAKE_CXX_COMPILER "g++" CACHE PATH "")
set(CMAKE_CUDA_COMPILER "nvcc" CACHE PATH "")
set(AMReX_GPU_BACKEND CUDA CACHE STRING "")
set(AMREX_GPUS_PER_NODE 4 CACHE STRING "")
set(CMAKE_CUDA_ARCHITECTURES 80 CACHE STRING "")
set(AMReX_ASCENT OFF CACHE BOOL "" FORCE)
set(AMReX_CONDUIT OFF CACHE BOOL "" FORCE)
21 changes: 15 additions & 6 deletions scripts/delta-1gpu.submit → scripts/blast_1gpu.submit
@@ -1,19 +1,28 @@
#!/bin/bash --login
#SBATCH --job-name="quokka_rt3d_test"
#SBATCH --job-name="hydro3dblast"
#SBATCH --partition=gpuA100x4
#SBATCH --account=cvz-delta-gpu
#SBATCH --mem=220G
#SBATCH --time=00:30:00
#SBATCH --mem=0
#SBATCH --exclusive
#SBATCH --time=0:10:00
#SBATCH --constraint="scratch"
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=closest,1
#SBATCH --exclusive
#SBATCH --gpu-bind=none

export OMPI_MCA_btl_smcuda_use_cuda_ipc=1 # enable cuda_ipc
export OMPI_MCA_btl_smcuda_cuda_ipc_verbose=100 # debugging

module purge
module load gcc/11.2.0
module load cuda/11.7.0
module load openmpi/4.1.4

# run
#srun ./build/src/RayleighTaylor3D/test_hydro3d_rt tests/RT3D.in
srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in




28 changes: 28 additions & 0 deletions scripts/blast_1node.submit
@@ -0,0 +1,28 @@
#!/bin/bash --login
#SBATCH --job-name="hydro3dblast"
#SBATCH --partition=gpuA100x4
#SBATCH --account=cvz-delta-gpu
#SBATCH --mem=0
#SBATCH --exclusive
#SBATCH --time=0:30:00
#SBATCH --constraint="scratch"
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=16
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=none

export OMPI_MCA_btl_smcuda_use_cuda_ipc=1 # enable cuda_ipc
export OMPI_MCA_btl_smcuda_cuda_ipc_verbose=100 # debugging

module purge
module load gcc/11.2.0
module load cuda/11.7.0
module load openmpi/4.1.4

# run
srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in




23 changes: 0 additions & 23 deletions scripts/delta-1node.submit

This file was deleted.

3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_amr_shell.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in
3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_amr_shell_stream.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1 amrex.max_gpu_streams=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.max_gpu_streams=1
4 changes: 1 addition & 3 deletions scripts/gpu_wrapper_cooling_1024.sh
@@ -1,6 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/Cooling/test_cooling tests/Cooling_1024.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1

./build/src/Cooling/test_cooling tests/Cooling_1024.in

1 change: 0 additions & 1 deletion scripts/gpu_wrapper_sedov_1024.sh
@@ -1,5 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_1024.in
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_1024.in
1 change: 0 additions & 1 deletion scripts/gpu_wrapper_sedov_2048.sh
@@ -1,5 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_2048.in
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_2048.in
1 change: 0 additions & 1 deletion scripts/gpu_wrapper_sedov_256.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in
6 changes: 0 additions & 6 deletions scripts/gpu_wrapper_sedov_256_gpuaware.sh

This file was deleted.

5 changes: 2 additions & 3 deletions scripts/gpu_wrapper_sedov_512.sh
@@ -1,6 +1,5 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_512.in amrex.async_out=1
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_512.in amrex.async_out=1 amrex.use_gpu_aware_mpi=0
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_512.in


3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_shell_1024.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_1024.in amrex.async_out=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_1024.in
4 changes: 2 additions & 2 deletions scripts/gpu_wrapper_shell_2048.sh
@@ -1,4 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_2048.in amrex.async_out=0
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_2048.in

3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_shell_256.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_256.in amrex.async_out=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_256.in
3 changes: 1 addition & 2 deletions scripts/gpu_wrapper_shell_512.sh
@@ -1,4 +1,3 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_512.in amrex.async_out=1
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_512.in
4 changes: 2 additions & 2 deletions src/Advection/test_advection.cpp
@@ -92,8 +92,8 @@ void AdvectionSimulation<SawtoothProblem>::computeReferenceSolution(
std::vector<double> d(nx);
std::vector<double> d_exact(nx);
for (int i = 0; i < nx; ++i) {
amrex::Real rho = values.at(0).at(i);
amrex::Real rho_exact = val_exact.at(0).at(i);
amrex::Real rho = values.at(0)[i];
amrex::Real rho_exact = val_exact.at(0)[i];
d.at(i) = rho;
d_exact.at(i) = rho_exact;
}
4 changes: 2 additions & 2 deletions src/AdvectionSemiellipse/test_advection_semiellipse.cpp
@@ -86,8 +86,8 @@ void AdvectionSimulation<SemiellipseProblem>::computeReferenceSolution(
std::vector<double> d(nx);
std::vector<double> d_exact(nx);
for (int i = 0; i < nx; ++i) {
amrex::Real rho = values.at(0).at(i);
amrex::Real rho_exact = val_exact.at(0).at(i);
amrex::Real rho = values.at(0)[i];
amrex::Real rho_exact = val_exact.at(0)[i];
d.at(i) = rho;
d_exact.at(i) = rho_exact;
}
4 changes: 1 addition & 3 deletions src/HydroBlast2D/test_hydro2d_blast.hpp
@@ -19,9 +19,7 @@
// internal headers

#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"

// function definitions

4 changes: 1 addition & 3 deletions src/HydroBlast3D/test_hydro3d_blast.hpp
@@ -14,9 +14,7 @@

// internal headers
#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"

// function definitions
auto testproblem_hydro_sedov() -> int;
14 changes: 7 additions & 7 deletions src/HydroContact/test_hydro_contact.cpp
@@ -141,16 +141,16 @@ void RadhydroSimulation<ContactProblem>::computeReferenceSolution(
std::vector<double> P_exact(nx);

for (int i = 0; i < nx; ++i) {
amrex::Real const this_x = position.at(i);
amrex::Real const this_x = position[i];
x.push_back(this_x);

{
const auto rho =
val_exact.at(HydroSystem<ContactProblem>::density_index).at(i);
val_exact.at(HydroSystem<ContactProblem>::density_index)[i];
const auto xmom =
val_exact.at(HydroSystem<ContactProblem>::x1Momentum_index).at(i);
val_exact.at(HydroSystem<ContactProblem>::x1Momentum_index)[i];
const auto E =
val_exact.at(HydroSystem<ContactProblem>::energy_index).at(i);
val_exact.at(HydroSystem<ContactProblem>::energy_index)[i];
const auto vx = xmom / rho;
const auto Eint = E - 0.5 * rho * (vx * vx);
const auto P = (HydroSystem<ContactProblem>::gamma_ - 1.) * Eint;
@@ -161,11 +161,11 @@

{
const auto frho =
values.at(HydroSystem<ContactProblem>::density_index).at(i);
values.at(HydroSystem<ContactProblem>::density_index)[i];
const auto fxmom =
values.at(HydroSystem<ContactProblem>::x1Momentum_index).at(i);
values.at(HydroSystem<ContactProblem>::x1Momentum_index)[i];
const auto fE =
values.at(HydroSystem<ContactProblem>::energy_index).at(i);
values.at(HydroSystem<ContactProblem>::energy_index)[i];
const auto fvx = fxmom / frho;
const auto fEint = fE - 0.5 * frho * (fvx * fvx);
const auto fP = (HydroSystem<ContactProblem>::gamma_ - 1.) * fEint;
4 changes: 1 addition & 3 deletions src/HydroContact/test_hydro_contact.hpp
@@ -19,9 +19,7 @@
// internal headers

#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"

// function definitions

2 changes: 1 addition & 1 deletion src/HydroHighMach/CMakeLists.txt
@@ -1,4 +1,4 @@
add_executable(test_hydro_highmach test_hydro_highmach.cpp ../main.cpp ../fextract.cpp ../interpolate.c)
add_executable(test_hydro_highmach test_hydro_highmach.cpp ../main.cpp ../fextract.cpp ../interpolate.cpp)

if(AMReX_GPU_BACKEND MATCHES "CUDA")
setup_target_for_cuda_compilation(test_hydro_highmach)
35 changes: 24 additions & 11 deletions src/HydroHighMach/test_hydro_highmach.cpp
@@ -89,7 +89,7 @@ void RadhydroSimulation<HighMachProblem>::computeReferenceSolution(
std::vector<double> x;

for (int i = 0; i < nx; ++i) {
Real const this_x = position.at(i);
Real const this_x = position[i];
x.push_back(this_x);
}

@@ -124,38 +124,51 @@
}

// interpolate density onto mesh
std::vector<double> d_interp(x.size());
amrex::Gpu::HostVector<double> d_interp(x.size());
interpolate_arrays(x.data(), d_interp.data(), static_cast<int>(x.size()),
x_exact.data(), d_exact.data(),
static_cast<int>(x_exact.size()));

// interpolate velocity onto mesh
std::vector<double> vx_interp(x.size());
amrex::Gpu::HostVector<double> vx_interp(x.size());
interpolate_arrays(x.data(), vx_interp.data(), static_cast<int>(x.size()),
x_exact.data(), vx_exact.data(),
static_cast<int>(x_exact.size()));

// interpolate pressure onto mesh
std::vector<double> P_interp(x.size());
amrex::Gpu::HostVector<double> P_interp(x.size());
interpolate_arrays(x.data(), P_interp.data(), static_cast<int>(x.size()),
x_exact.data(), P_exact.data(),
static_cast<int>(x_exact.size()));

amrex::Gpu::DeviceVector<double> rho_g(d_interp.size());
amrex::Gpu::DeviceVector<double> vx_g(vx_interp.size());
amrex::Gpu::DeviceVector<double> P_g(P_interp.size());

// copy exact solution to device
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, d_interp.begin(), d_interp.end(), rho_g.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, vx_interp.begin(), vx_interp.end(), vx_g.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, P_interp.begin(), P_interp.end(), P_g.begin());
amrex::Gpu::streamSynchronizeAll();

// save reference solution
const Real gamma = HydroSystem<HighMachProblem>::gamma_;
for (amrex::MFIter iter(ref); iter.isValid(); ++iter) {
const amrex::Box &indexRange = iter.validbox(); // excludes ghost zones
auto const &state = ref.array(iter);
auto const ncomp = ref.nComp();
auto const &rho_arr = rho_g.data();
auto const &vx_arr = vx_g.data();
auto const &P_arr = P_g.data();

amrex::LoopConcurrentOnCpu(indexRange, [=](int i, int j, int k) noexcept {
amrex::ParallelFor(indexRange, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
for (int n = 0; n < ncomp; ++n) {
state(i, j, k, n) = 0.;
}

Real rho = d_interp.at(i);
Real vx = vx_interp.at(i);
Real Pgas = P_interp.at(i);
Real rho = rho_arr[i];
Real vx = vx_arr[i];
Real Pgas = P_arr[i];
Real Eint = Pgas / (gamma - 1.);
Real Etot = Eint + 0.5 * rho * (vx * vx);

Expand All @@ -176,11 +189,11 @@ void RadhydroSimulation<HighMachProblem>::computeReferenceSolution(

for (int i = 0; i < nx; ++i) {
const auto frho =
values.at(HydroSystem<HighMachProblem>::density_index).at(i);
values.at(HydroSystem<HighMachProblem>::density_index)[i];
const auto fxmom =
values.at(HydroSystem<HighMachProblem>::x1Momentum_index).at(i);
values.at(HydroSystem<HighMachProblem>::x1Momentum_index)[i];
const auto fE =
values.at(HydroSystem<HighMachProblem>::energy_index).at(i);
values.at(HydroSystem<HighMachProblem>::energy_index)[i];
const auto fvx = fxmom / frho;
const auto fEint = fE - 0.5 * frho * (fvx * fvx);
const auto fP = (HydroSystem<HighMachProblem>::gamma_ - 1.) * fEint;
5 changes: 2 additions & 3 deletions src/HydroHighMach/test_hydro_highmach.hpp
@@ -19,9 +19,8 @@
// internal headers

#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"


// function definitions

4 changes: 1 addition & 3 deletions src/HydroKelvinHelmholz/test_hydro2d_kh.hpp
@@ -18,9 +18,7 @@
// internal headers

#include "hydro_system.hpp"
extern "C" {
#include "interpolate.h"
}
#include "interpolate.hpp"

// function definitions
