Commit

Merge branch 'feature.user.specific.shared.memory.splitting' into 'master.dev'

Added a new CMake option, PICLAS_SHARED_MEMORY, for splitting the shared memory communicator

See merge request piclas/piclas!635
scopplestone committed Apr 20, 2022
2 parents 6efed3e + 2c5dfca commit bdbd8de
Showing 6 changed files with 83 additions and 34 deletions.
7 changes: 6 additions & 1 deletion .gitlab-ci.yml
@@ -15,7 +15,12 @@ before_script:
echo " PICLAS_SPLIT_TYPE=OMPI_COMM_TYPE_CORE. Splitting shared memory domains on processor-level!";
export PICLAS_SPLIT_TYPE=OMPI_COMM_TYPE_CORE;
else
echo "Splitting shared memory domains on node-level! Set variable DO_CORE_SPLIT=T to force core-level shared memory splitting for all regression tests.";
if [ -n "${DO_NODE_SPLIT}" ]; then
echo " PICLAS_SPLIT_TYPE=PICLAS_COMM_TYPE_NODE. Splitting shared memory domains on sub-node-level with 2 cores per node!";
export PICLAS_SPLIT_TYPE=PICLAS_COMM_TYPE_NODE;
else
echo "Splitting shared memory domains on node-level! Set variable DO_CORE_SPLIT=T to force core-level OR DO_NODE_SPLIT=T to force sub-node-level shared memory splitting for all regression tests.";
fi
fi
# ----------------------------------------------------------------------------------------------------------------------------------------------------
# Stages
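
As a sketch of how the hook above is used (it reacts only to the DO_CORE_SPLIT and DO_NODE_SPLIT variables named in the diff; the surrounding runner setup is assumed):

    # Force sub-node-level splitting; the before_script then exports PICLAS_SPLIT_TYPE=PICLAS_COMM_TYPE_NODE
    export DO_NODE_SPLIT=T
    # ...or force core-level splitting instead (exports PICLAS_SPLIT_TYPE=OMPI_COMM_TYPE_CORE)
    # export DO_CORE_SPLIT=T
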
8 changes: 8 additions & 0 deletions docs/documentation/userguide/workflow.md
@@ -40,6 +40,14 @@ the CMake configuration file for HDF5 (optional).
output files into the VTK format
* ``POSTI_USE_PARAVIEW``: Enables the compilation of the ParaView plugin, which enables the direct read-in of output files within ParaView

* ``PICLAS_SHARED_MEMORY``: Split type for creating new communicators based on colors and keys (requires MPI 3 or higher).
Options with the prefix OMPI_ are specific to Open MPI.
* ``MPI_COMM_TYPE_SHARED``: creates one shared memory domain per physical node (default)
* ``OMPI_COMM_TYPE_CORE``: creates one shared memory domain per MPI thread
* ``PICLAS_COMM_TYPE_NODE``: creates one shared memory domain per group of X MPI threads, where X is defined by ``PICLAS_SHARED_MEMORY_CORES``
* ``PICLAS_SHARED_MEMORY_CORES``: Number of MPI threads per virtual node (default is 2). Assumes that all MPI threads run on the
same physical node.

(sec:solver-settings)=
## Solver settings

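
The options documented above can be set directly at configure time; a minimal sketch, assuming an out-of-source build directory and an illustrative value of 4 (the documented default is 2):

    cmake -DPICLAS_SHARED_MEMORY=PICLAS_COMM_TYPE_NODE -DPICLAS_SHARED_MEMORY_CORES=4 ..
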
38 changes: 21 additions & 17 deletions src/CMakeLists.txt
@@ -43,18 +43,34 @@ SET_PROPERTY(CACHE PICLAS_TIMEDISCMETHOD PROPERTY STRINGS Euler-Explicit
# =========================================================================
# Shared memory region splitting
# =========================================================================
# Get environment variable
# Get environment variable, e.g. PICLAS_SPLIT_TYPE=OMPI_COMM_TYPE_CORE
SET(PICLAS_SPLIT_TYPE "$ENV{PICLAS_SPLIT_TYPE}")
IF("${PICLAS_SPLIT_TYPE}" STREQUAL "")
SET(PICLAS_SPLIT_TYPE "MPI_COMM_TYPE_SHARED")
ELSE()
IF("${PICLAS_SPLIT_TYPE}" STREQUAL "${PICLAS_SHARED_MEMORY}")
MESSAGE(STATUS "Using user-defined environment variable PICLAS_SPLIT_TYPE=${PICLAS_SPLIT_TYPE} for shared memory communicator splitting of variale PICLAS_SHARED_MEMORY")
ENDIF()
#IF("${PICLAS_SPLIT_TYPE}" STREQUAL "${PICLAS_SHARED_MEMORY}")
MESSAGE(STATUS "Using user-defined environment variable [PICLAS_SPLIT_TYPE = ${PICLAS_SPLIT_TYPE}] for shared memory communicator splitting. Setting [PICLAS_SHARED_MEMORY = ${PICLAS_SPLIT_TYPE}]")
#ENDIF()
ENDIF()
SET(PICLAS_SHARED_MEMORY "${PICLAS_SPLIT_TYPE}" CACHE STRING "Split type for creating new communicators based on colors and keys (requires MPI 3 or higher). Options with the prefix OMPI_ are specific to Open MPI.")
SET_PROPERTY(CACHE PICLAS_SHARED_MEMORY PROPERTY STRINGS MPI_COMM_TYPE_SHARED
OMPI_COMM_TYPE_CORE)
OMPI_COMM_TYPE_CORE
PICLAS_COMM_TYPE_NODE)

ADD_DEFINITIONS(-DSharedMemoryMethod=${PICLAS_SHARED_MEMORY})
IF(PICLAS_SHARED_MEMORY STREQUAL "MPI_COMM_TYPE_SHARED")
UNSET(PICLAS_SHARED_MEMORY_CORES CACHE)
ADD_DEFINITIONS(-DCORE_SPLIT=0)
MESSAGE(STATUS "Shared memory split type for subcommunicators set to node-level")
ELSEIF(PICLAS_SHARED_MEMORY STREQUAL "PICLAS_COMM_TYPE_NODE")
SET(PICLAS_SHARED_MEMORY_CORES "2" CACHE STRING "Number of cores per node when setting PICLAS_SHARED_MEMORY=PICLAS_COMM_TYPE_NODE. All cores must be on the same physical node!")
ADD_DEFINITIONS(-DCORE_SPLIT=${PICLAS_SHARED_MEMORY_CORES})
MESSAGE(STATUS "Shared memory split type for subcommunicators set to sub-node-level with user-specific value [PICLAS_SHARED_MEMORY_CORES = ${PICLAS_SHARED_MEMORY_CORES}] cores per node")
ELSEIF(PICLAS_SHARED_MEMORY STREQUAL "OMPI_COMM_TYPE_CORE")
UNSET(PICLAS_SHARED_MEMORY_CORES CACHE)
ADD_DEFINITIONS(-DCORE_SPLIT=1)
MESSAGE(STATUS "Shared memory split type for subcommunicators set to core-level")
ENDIF()

# =========================================================================
# MISC
@@ -306,18 +322,6 @@ ELSE()
ADD_DEFINITIONS(-DUSE_HDG=0)
ENDIF(PICLAS_HDG)

# =========================================================================
# Shared memory split type
# =========================================================================
ADD_DEFINITIONS(-DSharedMemoryMethod=${PICLAS_SHARED_MEMORY})
IF(PICLAS_SHARED_MEMORY STREQUAL "MPI_COMM_TYPE_SHARED")
ADD_DEFINITIONS(-DUSE_CORE_SPLIT=0)
MESSAGE(STATUS "Shared memory split type for subcommunicators set to node-level")
ELSEIF(PICLAS_SHARED_MEMORY STREQUAL "OMPI_COMM_TYPE_CORE")
ADD_DEFINITIONS(-DUSE_CORE_SPLIT=1)
MESSAGE(STATUS "Shared memory split type for subcommunicators set to core-level")
ENDIF()

# ========================================================================
# LOADBALANCE
# =========================================================================
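
Alternatively, the CMake logic above picks the split type up from the environment on a fresh configure; a minimal sketch, assuming an empty build directory:

    # PICLAS_SPLIT_TYPE preseeds the PICLAS_SHARED_MEMORY cache entry (see src/CMakeLists.txt above)
    export PICLAS_SPLIT_TYPE=OMPI_COMM_TYPE_CORE
    cmake ..
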
4 changes: 2 additions & 2 deletions src/loadbalance/loaddistribution.f90
@@ -1279,11 +1279,11 @@ SUBROUTINE WriteElemTimeStatistics(WriteHeader,time_opt,iter_opt)

! Convert kB to GB
memory=memory/1048576.
#if USE_CORE_SPLIT
#if ! (CORE_SPLIT==0)
! When core-level splitting is used, it is not clear how many cores are on the same physical compute node.
! Therefore, the values are set to -1.
memory(2:3) = -1.
#endif /*USE_CORE_SPLIT*/
#endif /*! (CORE_SPLIT==0)*/

! Either create new file or add info to existing file
!> create new file
36 changes: 27 additions & 9 deletions src/mpi/mpi.f90
@@ -143,8 +143,9 @@ SUBROUTINE InitMPIvars()
USE MOD_Globals
USE MOD_PreProc
USE MOD_MPI_Vars
USE MOD_Interpolation_Vars,ONLY:InterpolationInitIsDone
USE MOD_Readintools, ONLY:GETINT
USE MOD_Interpolation_Vars ,ONLY: InterpolationInitIsDone
USE MOD_Readintools ,ONLY: GETINT
USE MOD_MPI_Shared_Vars ,ONLY: nProcessors_Global
! IMPLICIT VARIABLE HANDLING
IMPLICIT NONE
!-----------------------------------------------------------------------------------------------------------------------------------
@@ -155,11 +156,7 @@ SUBROUTINE InitMPIvars()
! LOCAL VARIABLES
INTEGER :: color,groupsize
!===================================================================================================================================
IF(.NOT.InterpolationInitIsDone)THEN
CALL Abort(&
__STAMP__&
,'InitMPITypes called before InitInterpolation')
END IF
IF(.NOT.InterpolationInitIsDone) CALL Abort(__STAMP__,'InitMPITypes called before InitInterpolation')
ALLOCATE(SendRequest_U(nNbProcs) )
ALLOCATE(SendRequest_U2(nNbProcs) )
ALLOCATE(SendRequest_GEO(nNbProcs) )
@@ -198,12 +195,25 @@ SUBROUTINE InitMPIvars()
GroupSize=GETINT('GroupSize','0')
IF(GroupSize.LT.1)THEN ! group procs by node
! Split the node communicator (shared memory) from the global communicator on physical processor or node level
#if USE_CORE_SPLIT
#if (CORE_SPLIT==1)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,myRank,0,MPI_COMM_NODE,iError)
#else
#elif (CORE_SPLIT==0)
! Note that using SharedMemoryMethod=OMPI_COMM_TYPE_CORE somehow does not work in every case (intel/amd processors)
! Also note that OMPI_COMM_TYPE_CORE is undefined when not using OpenMPI
CALL MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD,SharedMemoryMethod,0,MPI_INFO_NULL,MPI_COMM_NODE,IERROR)
#else
! Check if more nodes than procs are required or
! if the resulting split would create unequal procs per node
IF((CORE_SPLIT.GE.nProcessors_Global).OR.(MOD(nProcessors_Global,CORE_SPLIT).GT.0))THEN
SWRITE (*,'(A,I0,A,I0,A,F0.2,A)') ' WARNING: Either more nodes than cores selected (nodes: ',CORE_SPLIT,', cores: ',&
nProcessors_Global,') OR unequal number of cores per node (=',REAL(nProcessors_Global)/REAL(CORE_SPLIT),&
'). Setting 1 core per node for MPI_COMM_NODE!'
color = myRank
ELSE
! Group procs so that every CORE_SPLIT procs are in the same group
color = INT(REAL(myrank*CORE_SPLIT)/REAL(nProcessors_Global))+1
END IF ! (CORE_SPLIT.GE.nProcessors_Global).OR.(MOD().GT.0)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,color,0,MPI_COMM_NODE,iError)
#endif
ELSE ! use groupsize
color=myRank/GroupSize
@@ -213,6 +223,14 @@ SUBROUTINE InitMPIvars()
CALL MPI_COMM_SIZE(MPI_COMM_NODE,nLocalProcs,iError)
MPILocalRoot=(myLocalRank.EQ.0)

IF (nProcessors_Global.EQ.nLocalProcs) THEN
SWRITE(UNIT_stdOUt,'(A,I0,A,I0,A)') ' | Starting gathered I/O communication with ',nLocalProcs,' procs in ',1,' group'
ELSE
SWRITE(UNIT_stdOUt,'(A,I0,A,I0,A,I0,A)') ' | Starting gathered I/O communication with ',nLocalProcs,' procs each in ',&
nProcessors_Global/nLocalProcs,' groups for a total number of ',&
nProcessors_Global,' procs'
END IF

! now split global communicator into small group leaders and the others
MPI_COMM_LEADERS=MPI_COMM_NULL
MPI_COMM_WORKERS=MPI_COMM_NULL
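
Purely to illustrate the arithmetic of the new grouping formula, color = INT(REAL(myrank*CORE_SPLIT)/REAL(nProcessors_Global))+1, here is a small shell sketch with assumed values (4 global ranks, CORE_SPLIT=2):

    nprocs=4; core_split=2
    for rank in $(seq 0 $((nprocs - 1))); do
      echo "rank $rank -> color $(( rank * core_split / nprocs + 1 ))"
    done
    # prints colors 1 1 2 2, i.e. two groups of two ranks passed as colors to MPI_COMM_SPLIT
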
24 changes: 19 additions & 5 deletions src/mpi/mpi_shared.f90
@@ -126,12 +126,25 @@ SUBROUTINE InitMPIShared()
nProcessors_Global = nProcessors

! Split the node communicator (shared memory) from the global communicator on physical processor or node level
#if USE_CORE_SPLIT
#if (CORE_SPLIT==1)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,myRank,0,MPI_COMM_SHARED,iError)
#else
#elif (CORE_SPLIT==0)
! Note that using SharedMemoryMethod=OMPI_COMM_TYPE_CORE somehow does not work in every case (intel/amd processors)
! Also note that OMPI_COMM_TYPE_CORE is undefined when not using OpenMPI
CALL MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD,SharedMemoryMethod,0,MPI_INFO_NULL,MPI_COMM_SHARED,IERROR)
#else
! Check if more nodes than procs are required or
! if the resulting split would create unequal procs per node
IF((CORE_SPLIT.GE.nProcessors_Global).OR.(MOD(nProcessors_Global,CORE_SPLIT).GT.0))THEN
SWRITE (*,'(A,I0,A,I0,A,F0.2,A)') ' WARNING: Either more nodes than cores selected (nodes: ',CORE_SPLIT,', cores: ',&
nProcessors_Global,') OR unequal number of cores per node (=',REAL(nProcessors_Global)/REAL(CORE_SPLIT),&
'). Setting 1 core per node for MPI_COMM_SHARED!'
color = myRank
ELSE
! Group procs so that every CORE_SPLIT procs are in the same group
color = INT(REAL(myrank*CORE_SPLIT)/REAL(nProcessors_Global))+1
END IF ! (CORE_SPLIT.GE.nProcessors_Global).OR.(MOD().GT.0)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,color,0,MPI_COMM_SHARED,iError)
#endif

! Find my rank on the shared communicator, comm size and proc name
@@ -142,11 +155,12 @@ SUBROUTINE InitMPIShared()
IF (MOD(nProcessors_Global,nComputeNodeProcessors).NE.0) &
CALL ABORT(__STAMP__,'MPI shared communication currently only supported with equal procs per node!')

IF (nProcessors_Global/nComputeNodeProcessors.EQ.1) THEN
IF (nProcessors_Global.EQ.nComputeNodeProcessors) THEN
SWRITE(UNIT_stdOUt,'(A,I0,A,I0,A)') ' | Starting shared communication with ',nComputeNodeProcessors,' procs on ',1,' node'
ELSE
SWRITE(UNIT_stdOUt,'(A,I0,A,I0,A)') ' | Starting shared communication with ',nComputeNodeProcessors,' procs on ', &
nProcessors_Global/nComputeNodeProcessors,' nodes'
SWRITE(UNIT_stdOUt,'(A,I0,A,I0,A,I0,A)') ' | Starting shared communication with ',nComputeNodeProcessors,' procs on ', &
nProcessors_Global/nComputeNodeProcessors,' nodes for a total number of ',&
nProcessors_Global,' procs'
END IF

! Send rank of compute node root to all procs on shared comm
