Add experimental multi-GPU support
jngrad committed Jun 4, 2024
1 parent 26235a6 commit 7d0dbe0
Showing 10 changed files with 40 additions and 3 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -612,7 +612,7 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
if(CMAKE_VERSION VERSION_LESS 3.25 OR NOT ESPRESSO_CUDA_COMPILER STREQUAL
"clang")
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 75)
message(FATAL_ERROR "variable CMAKE_CUDA_ARCHITECTURES is undefined")
endif()
endif()
endif()
10 changes: 10 additions & 0 deletions doc/sphinx/lb.rst
@@ -387,6 +387,16 @@ of the LBM in analogy to the example for the CPU given in section
system.lb = lbf
system.integrator.run(100)

The waLBerla library supports multi-GPU simulations.
Without a CUDA-aware MPI library, multi-GPU simulations are slower than
single-GPU simulations and are therefore only relevant for LB systems
too large to fit in the memory of a single GPU device.
Multi-GPU support in |es| is an experimental feature whose API may change at any time.
It is activated by calling the following method before the creation of the
first LB GPU instance::

system.cuda_init_handle.call_method("set_device_id_per_rank")
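
A complete multi-GPU run might then look as follows (a minimal sketch: the
box size, fluid parameters, and the ``LBFluidWalberlaGPU`` arguments are
illustrative assumptions, not part of this commit), launched with e.g.
``mpirun -n 2 python script.py``::

    import espressomd
    import espressomd.lb

    system = espressomd.System(box_l=[16.0, 16.0, 16.0])
    system.time_step = 0.01
    system.cell_system.skin = 0.4

    # select one GPU per MPI rank before the first LB GPU instance exists
    if espressomd.gpu_available():
        system.cuda_init_handle.call_method("set_device_id_per_rank")

    lbf = espressomd.lb.LBFluidWalberlaGPU(
        agrid=1.0, density=1.0, kinematic_viscosity=1.0, tau=0.01)
    system.lb = lbf
    system.integrator.run(100)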

.. _Electrohydrodynamics:

Electrohydrodynamics
3 changes: 3 additions & 0 deletions src/core/cuda/common_cuda.cu
@@ -21,6 +21,9 @@

#include "utils.cuh"

#include <cuda.h>
#include <cuda_runtime.h>

#include <cstdio>

cudaStream_t stream[1];
5 changes: 3 additions & 2 deletions src/core/cuda/init_cuda.cu
@@ -17,13 +17,14 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <cuda.h>

#include "init.hpp"
#include "utils.cuh"

#include <utils/constants.hpp>

#include <cuda.h>
#include <cuda_runtime.h>

#include <cstring>
#include <string>

1 change: 1 addition & 0 deletions src/core/cuda/utils.cuh
@@ -26,6 +26,7 @@
#include "utils.hpp"

#include <cuda.h>
#include <cuda_runtime.h>

#include <string>

12 changes: 12 additions & 0 deletions src/script_interface/system/CudaInitHandle.cpp
@@ -24,6 +24,10 @@
#include "core/cuda/init.hpp"
#include "core/cuda/utils.hpp"

#if defined(CUDA) && defined(WALBERLA)
#include "walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp"
#endif

#include <string>
#include <unordered_map>
#include <utility>
@@ -100,6 +104,14 @@ Variant CudaInitHandle::do_call_method(std::string const &name,
#endif // CUDA
return n_gpus;
}
#if defined(CUDA) && defined(WALBERLA)
if (name == "set_device_id_per_rank") {
if (cuda_get_n_gpus()) {
set_device_id_per_rank();
}
return {};
}
#endif
return {};
}

2 changes: 2 additions & 0 deletions src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp
@@ -31,3 +31,5 @@ new_lb_walberla_cpu(std::shared_ptr<LatticeWalberla> const &lattice,
std::shared_ptr<LBWalberlaBase>
new_lb_walberla_gpu(std::shared_ptr<LatticeWalberla> const &lattice,
double viscosity, double density, bool single_precision);

void set_device_id_per_rank();
4 changes: 4 additions & 0 deletions src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cu
@@ -44,6 +44,8 @@
#include <walberla_bridge/LatticeWalberla.hpp>
#include <walberla_bridge/lattice_boltzmann/LBWalberlaBase.hpp>

#include <gpu/DeviceSelectMPI.h>

#include <memory>

std::shared_ptr<LBWalberlaBase>
@@ -56,3 +58,5 @@ new_lb_walberla_gpu(std::shared_ptr<LatticeWalberla> const &lattice,
return std::make_shared<walberla::LBWalberlaImpl<double, lbmpy::Arch::GPU>>(
lattice, viscosity, density);
}

void set_device_id_per_rank() { walberla::gpu::selectDeviceBasedOnMpiRank(); }
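
For context, ``walberla::gpu::selectDeviceBasedOnMpiRank()`` binds each MPI
rank to one of its node's CUDA devices. A minimal Python sketch of the same
round-robin idea (an illustration under assumptions, not waLBerla's actual
C++ implementation)::

    from mpi4py import MPI

    def device_id_for_rank(n_devices):
        # split by shared-memory node to obtain a node-local rank,
        # then map it onto the node's devices round-robin
        node_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED)
        return node_comm.Get_rank() % max(n_devices, 1)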
2 changes: 2 additions & 0 deletions testsuite/python/lb.py
@@ -52,6 +52,8 @@ class LBTest:
system.periodicity = [True, True, True]
system.time_step = params['tau']
system.cell_system.skin = 1.0
if espressomd.gpu_available():
system.cuda_init_handle.call_method("set_device_id_per_rank")
interpolation = False

def setUp(self):
2 changes: 2 additions & 0 deletions testsuite/python/lb_boundary_ghost_layer.py
@@ -38,6 +38,8 @@ class TestCommon:
system = espressomd.System(box_l=[16.0, 1.0, 1.0])
system.time_step = TIME_STEP
system.cell_system.skin = 0.4 * AGRID
if espressomd.gpu_available():
system.cuda_init_handle.call_method("set_device_id_per_rank")
n_nodes = system.cell_system.get_state()["n_nodes"]

def setUp(self):
