Skip to content

Commit

Permalink
Revert "enable gpu-aware MPI by default (#121)"
Browse files Browse the repository at this point in the history
This reverts commit 4250108.
  • Loading branch information
BenWibking committed Oct 3, 2022
1 parent 4250108 commit d3bc5e6
Show file tree
Hide file tree
Showing 86 changed files with 346 additions and 520 deletions.
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,10 @@ to the CMake command-line options (or change the `QUOKKA_PYTHON` option to `OFF`
## Running on GPUs
By default, Quokka compiles itself to run only on CPUs. If you want to run on NVIDIA GPUs, re-build Quokka as shown below. **(Warning: CUDA 11.6 generates invalid device code; see issue [21](https://github.com/BenWibking/quokka/issues/21). Use CUDA <= 11.5 instead.)**
```
cmake .. -DCMAKE_BUILD_TYPE=Release -DAMReX_GPU_BACKEND=CUDA -DAMREX_GPUS_PER_NODE=N
cmake .. -DCMAKE_BUILD_TYPE=Release -DAMReX_GPU_BACKEND=CUDA
make -j6
```
where $N$ is the number of GPUs available per compute node.

It is necessary to use `-DAMREX_GPUS_PER_NODE` to specify the number of GPUs per compute node. Without this, performance will be very poor. All GPUs on a node must be visible from each MPI rank on the node for efficient GPU-aware MPI communication to take place via CUDA IPC. (When using the SLURM job scheduler, this means that `--gpu-bind` should be set to `none`.)

The compiled test problems are in the test problem subdirectories in `build/src/`. Example scripts for running Quokka on compute clusters are in the `scripts/` subdirectory.

Note that Quokka is only supported on Volta-class (V100) GPUs or newer.
The compiled test problems are in the test problem subdirectories in `build/src/`. Example scripts for running Quokka on compute clusters are in the `scripts/` subdirectory. Please note that you must configure your compute cluster to run with 1 MPI rank per GPU in order for Quokka to work correctly. Quokka is only supported on Volta-class (V100) GPUs or newer.

Note that 1D problems can run very slowly on GPUs due to a lack of sufficient parallelism. To run the test suite in a reasonable amount of time, you may wish to exclude the matter-energy exchange tests, e.g.:
```
Expand Down
3 changes: 1 addition & 2 deletions scripts/blast_1gpu.submit
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ module load cuda/11.7.0
module load openmpi/4.1.4

# run
srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in

srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in amrex.use_gpu_aware_mpi=1 amrex.the_arena_is_managed=0



3 changes: 1 addition & 2 deletions scripts/blast_1node.submit
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ module load cuda/11.7.0
module load openmpi/4.1.4

# run
srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in

srun ./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in amrex.use_gpu_aware_mpi=1 amrex.the_arena_is_managed=0



3 changes: 2 additions & 1 deletion scripts/gpu_wrapper_amr_shell.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash

./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in
export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1
3 changes: 2 additions & 1 deletion scripts/gpu_wrapper_amr_shell_stream.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash

./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.max_gpu_streams=1
export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_amr.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1 amrex.max_gpu_streams=1
4 changes: 3 additions & 1 deletion scripts/gpu_wrapper_cooling_1024.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/bin/bash

./build/src/Cooling/test_cooling tests/Cooling_1024.in
export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/Cooling/test_cooling tests/Cooling_1024.in amrex.async_out=1 amrex.abort_on_out_of_gpu_memory=1


1 change: 1 addition & 0 deletions scripts/gpu_wrapper_sedov_1024.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_1024.in
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_1024.in
1 change: 1 addition & 0 deletions scripts/gpu_wrapper_sedov_2048.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_2048.in
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_2048.in
1 change: 1 addition & 0 deletions scripts/gpu_wrapper_sedov_256.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_256.in
5 changes: 3 additions & 2 deletions scripts/gpu_wrapper_sedov_512.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/bin/bash

./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_512.in

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
#hpcrun -e gpu=nvidia --trace ./build/src/test_hydro3d_blast tests/blast_unigrid_512.in amrex.async_out=1
./build/src/HydroBlast3D/test_hydro3d_blast tests/blast_unigrid_512.in amrex.async_out=1 amrex.use_gpu_aware_mpi=0

3 changes: 2 additions & 1 deletion scripts/gpu_wrapper_shell_1024.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash

./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_1024.in
export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_1024.in amrex.async_out=1
4 changes: 2 additions & 2 deletions scripts/gpu_wrapper_shell_2048.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash

./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_2048.in

export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_2048.in amrex.async_out=0
3 changes: 2 additions & 1 deletion scripts/gpu_wrapper_shell_256.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash

./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_256.in
export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_256.in amrex.async_out=1
3 changes: 2 additions & 1 deletion scripts/gpu_wrapper_shell_512.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash

./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_512.in
export CUDA_VISIBLE_DEVICES=$(($OMPI_COMM_WORLD_LOCAL_RANK % 4))
./build/src/RadhydroShell/test_radhydro3d_shell tests/radhydro_shell_512.in amrex.async_out=1
4 changes: 2 additions & 2 deletions src/Advection/test_advection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ void AdvectionSimulation<SawtoothProblem>::computeReferenceSolution(
std::vector<double> d(nx);
std::vector<double> d_exact(nx);
for (int i = 0; i < nx; ++i) {
amrex::Real rho = values.at(0)[i];
amrex::Real rho_exact = val_exact.at(0)[i];
amrex::Real rho = values.at(0).at(i);
amrex::Real rho_exact = val_exact.at(0).at(i);
d.at(i) = rho;
d_exact.at(i) = rho_exact;
}
Expand Down
4 changes: 2 additions & 2 deletions src/AdvectionSemiellipse/test_advection_semiellipse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ void AdvectionSimulation<SemiellipseProblem>::computeReferenceSolution(
std::vector<double> d(nx);
std::vector<double> d_exact(nx);
for (int i = 0; i < nx; ++i) {
amrex::Real rho = values.at(0)[i];
amrex::Real rho_exact = val_exact.at(0)[i];
amrex::Real rho = values.at(0).at(i);
amrex::Real rho_exact = val_exact.at(0).at(i);
d.at(i) = rho;
d_exact.at(i) = rho_exact;
}
Expand Down
4 changes: 3 additions & 1 deletion src/HydroBlast2D/test_hydro2d_blast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
// internal headers

#include "hydro_system.hpp"
#include "interpolate.hpp"
extern "C" {
#include "interpolate.h"
}

// function definitions

Expand Down
4 changes: 3 additions & 1 deletion src/HydroBlast3D/test_hydro3d_blast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@

// internal headers
#include "hydro_system.hpp"
#include "interpolate.hpp"
extern "C" {
#include "interpolate.h"
}

// function definitions
auto testproblem_hydro_sedov() -> int;
Expand Down
14 changes: 7 additions & 7 deletions src/HydroContact/test_hydro_contact.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,16 +137,16 @@ void RadhydroSimulation<ContactProblem>::computeReferenceSolution(
std::vector<double> P_exact(nx);

for (int i = 0; i < nx; ++i) {
amrex::Real const this_x = position[i];
amrex::Real const this_x = position.at(i);
x.push_back(this_x);

{
const auto rho =
val_exact.at(HydroSystem<ContactProblem>::density_index)[i];
val_exact.at(HydroSystem<ContactProblem>::density_index).at(i);
const auto xmom =
val_exact.at(HydroSystem<ContactProblem>::x1Momentum_index)[i];
val_exact.at(HydroSystem<ContactProblem>::x1Momentum_index).at(i);
const auto E =
val_exact.at(HydroSystem<ContactProblem>::energy_index)[i];
val_exact.at(HydroSystem<ContactProblem>::energy_index).at(i);
const auto vx = xmom / rho;
const auto Eint = E - 0.5 * rho * (vx * vx);
const auto P = (HydroSystem<ContactProblem>::gamma_ - 1.) * Eint;
Expand All @@ -157,11 +157,11 @@ void RadhydroSimulation<ContactProblem>::computeReferenceSolution(

{
const auto frho =
values.at(HydroSystem<ContactProblem>::density_index)[i];
values.at(HydroSystem<ContactProblem>::density_index).at(i);
const auto fxmom =
values.at(HydroSystem<ContactProblem>::x1Momentum_index)[i];
values.at(HydroSystem<ContactProblem>::x1Momentum_index).at(i);
const auto fE =
values.at(HydroSystem<ContactProblem>::energy_index)[i];
values.at(HydroSystem<ContactProblem>::energy_index).at(i);
const auto fvx = fxmom / frho;
const auto fEint = fE - 0.5 * frho * (fvx * fvx);
const auto fP = (HydroSystem<ContactProblem>::gamma_ - 1.) * fEint;
Expand Down
4 changes: 3 additions & 1 deletion src/HydroContact/test_hydro_contact.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
// internal headers

#include "hydro_system.hpp"
#include "interpolate.hpp"
extern "C" {
#include "interpolate.h"
}

// function definitions

Expand Down
2 changes: 1 addition & 1 deletion src/HydroHighMach/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
add_executable(test_hydro_highmach test_hydro_highmach.cpp ../main.cpp ../fextract.cpp ../interpolate.cpp)
add_executable(test_hydro_highmach test_hydro_highmach.cpp ../main.cpp ../fextract.cpp ../interpolate.c)

if(AMReX_GPU_BACKEND MATCHES "CUDA")
setup_target_for_cuda_compilation(test_hydro_highmach)
Expand Down
35 changes: 11 additions & 24 deletions src/HydroHighMach/test_hydro_highmach.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void RadhydroSimulation<HighMachProblem>::computeReferenceSolution(
std::vector<double> x;

for (int i = 0; i < nx; ++i) {
Real const this_x = position[i];
Real const this_x = position.at(i);
x.push_back(this_x);
}

Expand Down Expand Up @@ -122,51 +122,38 @@ void RadhydroSimulation<HighMachProblem>::computeReferenceSolution(
}

// interpolate density onto mesh
amrex::Gpu::HostVector<double> d_interp(x.size());
std::vector<double> d_interp(x.size());
interpolate_arrays(x.data(), d_interp.data(), static_cast<int>(x.size()),
x_exact.data(), d_exact.data(),
static_cast<int>(x_exact.size()));

// interpolate velocity onto mesh
amrex::Gpu::HostVector<double> vx_interp(x.size());
std::vector<double> vx_interp(x.size());
interpolate_arrays(x.data(), vx_interp.data(), static_cast<int>(x.size()),
x_exact.data(), vx_exact.data(),
static_cast<int>(x_exact.size()));

// interpolate pressure onto mesh
amrex::Gpu::HostVector<double> P_interp(x.size());
std::vector<double> P_interp(x.size());
interpolate_arrays(x.data(), P_interp.data(), static_cast<int>(x.size()),
x_exact.data(), P_exact.data(),
static_cast<int>(x_exact.size()));

amrex::Gpu::DeviceVector<double> rho_g(d_interp.size());
amrex::Gpu::DeviceVector<double> vx_g(vx_interp.size());
amrex::Gpu::DeviceVector<double> P_g(P_interp.size());

// copy exact solution to device
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, d_interp.begin(), d_interp.end(), rho_g.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, vx_interp.begin(), vx_interp.end(), vx_g.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, P_interp.begin(), P_interp.end(), P_g.begin());
amrex::Gpu::streamSynchronizeAll();

// save reference solution
const Real gamma = HydroSystem<HighMachProblem>::gamma_;
for (amrex::MFIter iter(ref); iter.isValid(); ++iter) {
const amrex::Box &indexRange = iter.validbox(); // excludes ghost zones
auto const &state = ref.array(iter);
auto const ncomp = ref.nComp();
auto const &rho_arr = rho_g.data();
auto const &vx_arr = vx_g.data();
auto const &P_arr = P_g.data();

amrex::ParallelFor(indexRange, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
amrex::LoopConcurrentOnCpu(indexRange, [=](int i, int j, int k) noexcept {
for (int n = 0; n < ncomp; ++n) {
state(i, j, k, n) = 0.;
}

Real rho = rho_arr[i];
Real vx = vx_arr[i];
Real Pgas = P_arr[i];
Real rho = d_interp.at(i);
Real vx = vx_interp.at(i);
Real Pgas = P_interp.at(i);
Real Eint = Pgas / (gamma - 1.);
Real Etot = Eint + 0.5 * rho * (vx * vx);

Expand All @@ -187,11 +174,11 @@ void RadhydroSimulation<HighMachProblem>::computeReferenceSolution(

for (int i = 0; i < nx; ++i) {
const auto frho =
values.at(HydroSystem<HighMachProblem>::density_index)[i];
values.at(HydroSystem<HighMachProblem>::density_index).at(i);
const auto fxmom =
values.at(HydroSystem<HighMachProblem>::x1Momentum_index)[i];
values.at(HydroSystem<HighMachProblem>::x1Momentum_index).at(i);
const auto fE =
values.at(HydroSystem<HighMachProblem>::energy_index)[i];
values.at(HydroSystem<HighMachProblem>::energy_index).at(i);
const auto fvx = fxmom / frho;
const auto fEint = fE - 0.5 * frho * (fvx * fvx);
const auto fP = (HydroSystem<HighMachProblem>::gamma_ - 1.) * fEint;
Expand Down
5 changes: 3 additions & 2 deletions src/HydroHighMach/test_hydro_highmach.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
// internal headers

#include "hydro_system.hpp"
#include "interpolate.hpp"

extern "C" {
#include "interpolate.h"
}

// function definitions

Expand Down
4 changes: 3 additions & 1 deletion src/HydroKelvinHelmholz/test_hydro2d_kh.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
// internal headers

#include "hydro_system.hpp"
#include "interpolate.hpp"
extern "C" {
#include "interpolate.h"
}

// function definitions

Expand Down
2 changes: 1 addition & 1 deletion src/HydroLeblanc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
add_executable(test_hydro_leblanc ../main.cpp test_hydro_leblanc.cpp ../fextract.cpp ../interpolate.cpp)
add_executable(test_hydro_leblanc ../main.cpp test_hydro_leblanc.cpp ../fextract.cpp ../interpolate.c)

if(AMReX_GPU_BACKEND MATCHES "CUDA")
setup_target_for_cuda_compilation(test_hydro_leblanc)
Expand Down
27 changes: 7 additions & 20 deletions src/HydroLeblanc/test_hydro_leblanc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,32 +204,19 @@ void RadhydroSimulation<ShocktubeProblem>::computeReferenceSolution(
static_cast<int>(xs.size()), xs_exact.data(),
eint_exact.data(), static_cast<int>(xs_exact.size()));

amrex::Gpu::DeviceVector<double> rho_g(density_exact_interp.size());
amrex::Gpu::DeviceVector<double> vx_g(velocity_exact_interp.size());
amrex::Gpu::DeviceVector<double> P_g(pressure_exact_interp.size());

// copy exact solution to device
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, density_exact_interp.begin(), density_exact_interp.end(), rho_g.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, velocity_exact_interp.begin(), velocity_exact_interp.end(), vx_g.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, pressure_exact_interp.begin(), pressure_exact_interp.end(), P_g.begin());
amrex::Gpu::streamSynchronizeAll();

// fill reference solution multifab
for (amrex::MFIter iter(ref); iter.isValid(); ++iter) {
const amrex::Box &indexRange = iter.validbox();
auto const &stateExact = ref.array(iter);
auto const ncomp = ref.nComp();
auto const &rho_arr = rho_g.data();
auto const &vx_arr = vx_g.data();
auto const &P_arr = P_g.data();

amrex::ParallelFor(indexRange, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
amrex::LoopConcurrentOnCpu(indexRange, [=](int i, int j, int k) noexcept {
for (int n = 0; n < ncomp; ++n) {
stateExact(i, j, k, n) = 0.;
}
amrex::Real rho = rho_arr[i];
amrex::Real vx = vx_arr[i];
amrex::Real P = P_arr[i];
amrex::Real rho = density_exact_interp.at(i);
amrex::Real vx = velocity_exact_interp.at(i);
amrex::Real P = pressure_exact_interp.at(i);

const auto gamma = HydroSystem<ShocktubeProblem>::gamma_;
stateExact(i, j, k, HydroSystem<ShocktubeProblem>::density_index) = rho;
Expand Down Expand Up @@ -259,11 +246,11 @@ void RadhydroSimulation<ShocktubeProblem>::computeReferenceSolution(

for (int i = 0; i < nx; ++i) {
amrex::Real rho =
values.at(HydroSystem<ShocktubeProblem>::density_index)[i];
values.at(HydroSystem<ShocktubeProblem>::density_index).at(i);
amrex::Real xmom =
values.at(HydroSystem<ShocktubeProblem>::x1Momentum_index)[i];
values.at(HydroSystem<ShocktubeProblem>::x1Momentum_index).at(i);
amrex::Real Egas =
values.at(HydroSystem<ShocktubeProblem>::energy_index)[i];
values.at(HydroSystem<ShocktubeProblem>::energy_index).at(i);

amrex::Real xvel = xmom / rho;
amrex::Real Eint = Egas - xmom * xmom / (2.0 * rho);
Expand Down
5 changes: 3 additions & 2 deletions src/HydroLeblanc/test_hydro_leblanc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
// internal headers

#include "hydro_system.hpp"
#include "interpolate.hpp"

extern "C" {
#include "interpolate.h"
}

// function definitions

Expand Down
4 changes: 3 additions & 1 deletion src/HydroQuirk/test_quirk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
// internal headers

#include "hydro_system.hpp"
#include "interpolate.hpp"
extern "C" {
#include "interpolate.h"
}

// function definitions

Expand Down
Loading

0 comments on commit d3bc5e6

Please sign in to comment.