Add experimental multi-GPU support
jngrad committed Jun 4, 2024
1 parent 26235a6 commit 7d0dbe0
Showing 10 changed files with 40 additions and 3 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -612,7 +612,7 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
if(CMAKE_VERSION VERSION_LESS 3.25 OR NOT ESPRESSO_CUDA_COMPILER STREQUAL
"clang")
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 75)
message(FATAL_ERROR "variable CMAKE_CUDA_ARCHITECTURES is undefined")
endif()
endif()
endif()
10 changes: 10 additions & 0 deletions doc/sphinx/lb.rst
@@ -387,6 +387,16 @@ of the LBM in analogy to the example for the CPU given in section
system.lb = lbf
system.integrator.run(100)

The waLBerla library supports multi-GPU simulations.
Without a CUDA-aware MPI library, multi-GPU simulations are slower than
single-GPU simulations and are therefore only relevant for LB systems
too large to fit in the memory of a single GPU device.
Multi-GPU support in |es| is an experimental feature whose API may change at any time.
It is activated by calling the following method before the creation of the
first LB GPU instance::

system.cuda_init_handle.call_method("set_device_id_per_rank")
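
A complete multi-GPU run might then look as follows (a minimal sketch: the
box size, fluid parameters, and the ``LBFluidWalberlaGPU`` arguments are
illustrative assumptions, not part of this commit), launched with e.g.
``mpirun -n 2 python script.py``::

    import espressomd
    import espressomd.lb

    system = espressomd.System(box_l=[16.0, 16.0, 16.0])
    system.time_step = 0.01
    system.cell_system.skin = 0.4

    # select one GPU per MPI rank before the first LB GPU instance exists
    if espressomd.gpu_available():
        system.cuda_init_handle.call_method("set_device_id_per_rank")

    lbf = espressomd.lb.LBFluidWalberlaGPU(
        agrid=1.0, density=1.0, kinematic_viscosity=1.0, tau=0.01)
    system.lb = lbf
    system.integrator.run(100)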

.. _Electrohydrodynamics:

Electrohydrodynamics
3 changes: 3 additions & 0 deletions src/core/cuda/common_cuda.cu
@@ -21,6 +21,9 @@

#include "utils.cuh"

#include <cuda.h>
#include <cuda_runtime.h>

#include <cstdio>

cudaStream_t stream[1];
5 changes: 3 additions & 2 deletions src/core/cuda/init_cuda.cu
@@ -17,13 +17,14 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <cuda.h>

#include "init.hpp"
#include "utils.cuh"

#include <utils/constants.hpp>

#include <cuda.h>
#include <cuda_runtime.h>

#include <cstring>
#include <string>

1 change: 1 addition & 0 deletions src/core/cuda/utils.cuh
@@ -26,6 +26,7 @@
#include "utils.hpp"

#include <cuda.h>
#include <cuda_runtime.h>

#include <string>

12 changes: 12 additions & 0 deletions src/script_interface/system/CudaInitHandle.cpp
@@ -24,6 +24,10 @@
#include "core/cuda/init.hpp"
#include "core/cuda/utils.hpp"

#if defined(CUDA) && defined(WALBERLA)
#include "walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp"
#endif

#include <string>
#include <unordered_map>
#include <utility>
@@ -100,6 +104,14 @@ Variant CudaInitHandle::do_call_method(std::string const &name,
#endif // CUDA
return n_gpus;
}
#if defined(CUDA) && defined(WALBERLA)
if (name == "set_device_id_per_rank") {
if (cuda_get_n_gpus()) {
set_device_id_per_rank();
}
return {};
}
#endif
return {};
}

2 changes: 2 additions & 0 deletions src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp
@@ -31,3 +31,5 @@ new_lb_walberla_cpu(std::shared_ptr<LatticeWalberla> const &lattice,
std::shared_ptr<LBWalberlaBase>
new_lb_walberla_gpu(std::shared_ptr<LatticeWalberla> const &lattice,
double viscosity, double density, bool single_precision);

void set_device_id_per_rank();
4 changes: 4 additions & 0 deletions src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cu
@@ -44,6 +44,8 @@
#include <walberla_bridge/LatticeWalberla.hpp>
#include <walberla_bridge/lattice_boltzmann/LBWalberlaBase.hpp>

#include <gpu/DeviceSelectMPI.h>

#include <memory>

std::shared_ptr<LBWalberlaBase>
@@ -56,3 +58,5 @@ new_lb_walberla_gpu(std::shared_ptr<LatticeWalberla> const &lattice,
return std::make_shared<walberla::LBWalberlaImpl<double, lbmpy::Arch::GPU>>(
lattice, viscosity, density);
}

void set_device_id_per_rank() { walberla::gpu::selectDeviceBasedOnMpiRank(); }
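
For context, ``walberla::gpu::selectDeviceBasedOnMpiRank()`` binds each MPI
rank to one of its node's CUDA devices. A minimal Python sketch of the same
round-robin idea (an illustration under assumptions, not waLBerla's actual
C++ implementation)::

    from mpi4py import MPI

    def device_id_for_rank(n_devices):
        # split by shared-memory node to obtain a node-local rank,
        # then map it onto the node's devices round-robin
        node_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED)
        return node_comm.Get_rank() % max(n_devices, 1)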
2 changes: 2 additions & 0 deletions testsuite/python/lb.py
@@ -52,6 +52,8 @@ class LBTest:
system.periodicity = [True, True, True]
system.time_step = params['tau']
system.cell_system.skin = 1.0
if espressomd.gpu_available():
system.cuda_init_handle.call_method("set_device_id_per_rank")
interpolation = False

def setUp(self):
2 changes: 2 additions & 0 deletions testsuite/python/lb_boundary_ghost_layer.py
@@ -38,6 +38,8 @@ class TestCommon:
system = espressomd.System(box_l=[16.0, 1.0, 1.0])
system.time_step = TIME_STEP
system.cell_system.skin = 0.4 * AGRID
if espressomd.gpu_available():
system.cuda_init_handle.call_method("set_device_id_per_rank")
n_nodes = system.cell_system.get_state()["n_nodes"]

def setUp(self):
