From d575226914c9d27d90d5076d6284ffc7d558c27a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Raffaele=20Solc=C3=A0?= <rasolca@cscs.ch>
Date: Mon, 23 Sep 2024 17:58:37 +0200
Subject: [PATCH] gpu to gh200

---
 ci/.gitlab-ci.yml                 | 36 +++++++++---------
 ci/common-ci.yml                  | 62 +++++++++++++++++++++++++++----
 ci/ctest_to_gitlab.sh             |  5 +--
 ci/cuda/gcc11_release.yml         | 31 ----------------
 ci/cuda/gcc13_release.yml         | 28 ++++++++++++++
 ci/docker/common-gh200.yaml       | 55 +++++++++++++++++++++++++++
 ci/docker/deploy.Dockerfile       |  3 +-
 ci/docker/release-cuda-gh200.yaml | 26 +++++++++++++
 ci/mpi-ctest                      | 17 ---------
 9 files changed, 185 insertions(+), 78 deletions(-)
 delete mode 100644 ci/cuda/gcc11_release.yml
 create mode 100644 ci/cuda/gcc13_release.yml
 create mode 100644 ci/docker/common-gh200.yaml
 create mode 100644 ci/docker/release-cuda-gh200.yaml

diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml
index 0204a001ee..c8f6a96fba 100644
--- a/ci/.gitlab-ci.yml
+++ b/ci/.gitlab-ci.yml
@@ -1,19 +1,19 @@
 include:
-  - local: 'ci/cpu/asan_ubsan_lsan.yml'
-  - local: 'ci/cpu/clang15_release_cxx20.yml'
-  - local: 'ci/cpu/clang15_release_stdexec.yml'
-  - local: 'ci/cpu/clang15_release.yml'
-  - local: 'ci/cpu/clang16_release.yml'
-  - local: 'ci/cpu/clang18_release.yml'
-  - local: 'ci/cpu/gcc11_release_stdexec.yml'
-  - local: 'ci/cpu/gcc11_debug_stdexec.yml'
-  - local: 'ci/cpu/gcc12_release_cxx20.yml'
-  - local: 'ci/cpu/gcc13_codecov.yml'
-  - local: 'ci/cpu/gcc13_release.yml'
-  - local: 'ci/cuda/gcc11_release.yml'
-  - local: 'ci/cuda/gcc11_release_scalapack.yml'
-  - local: 'ci/cuda/gcc11_codecov.yml'
-  - local: 'ci/cuda/gcc11_debug_scalapack.yml'
-  - local: 'ci/cuda/gcc13_release_stdexec.yml'
-  - local: 'ci/rocm/clang14_release.yml'
-  - local: 'ci/rocm/clang14_release_stdexec.yml'
+#  - local: 'ci/cpu/asan_ubsan_lsan.yml'
+#  - local: 'ci/cpu/clang15_release_cxx20.yml'
+#  - local: 'ci/cpu/clang15_release_stdexec.yml'
+#  - local: 'ci/cpu/clang15_release.yml'
+#  - local: 'ci/cpu/clang16_release.yml'
+#  - local: 'ci/cpu/clang18_release.yml'
+#  - local: 'ci/cpu/gcc11_release_stdexec.yml'
+#  - local: 'ci/cpu/gcc11_debug_stdexec.yml'
+#  - local: 'ci/cpu/gcc12_release_cxx20.yml'
+#  - local: 'ci/cpu/gcc13_codecov.yml'
+#  - local: 'ci/cpu/gcc13_release.yml'
+  - local: 'ci/cuda/gcc13_release.yml'
+#  - local: 'ci/cuda/gcc11_release_scalapack.yml'
+#  - local: 'ci/cuda/gcc11_codecov.yml'
+#  - local: 'ci/cuda/gcc11_debug_scalapack.yml'
+#  - local: 'ci/cuda/gcc13_release_stdexec.yml'
+#  - local: 'ci/rocm/clang14_release.yml'
+#  - local: 'ci/rocm/clang14_release_stdexec.yml'
diff --git a/ci/common-ci.yml b/ci/common-ci.yml
index 396f03efac..d3eb156d67 100644
--- a/ci/common-ci.yml
+++ b/ci/common-ci.yml
@@ -9,12 +9,9 @@ stages:
 variables:
   FF_TIMESTAMPS: true
 
-##
-## BUILDS
-##
+## BUILD DEPS
 
-.build_deps_common:
-  extends: .container-builder
+.build_deps_common_base:
   stage: build_deps
   timeout: 6 hours
   before_script:
@@ -58,11 +55,26 @@ variables:
     EXTRA_APTGET: ""
     CXXSTD: 17
     USE_MKL: "OFF"
-    COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
     USE_CODECOV: "false"
 
-.build_common:
-  extends: .container-builder
+.build_deps_common:
+  extends:
+    - .container-builder-cscs-zen2
+    - .build_deps_common_base
+  variables:
+    COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
+
+.build_deps_common_gh200:
+  extends:
+    - .container-builder-cscs-gh200
+    - .build_deps_common_base
+  variables:
+    COMMON_SPACK_ENVIRONMENT: ci/docker/common-gh200.yaml
+    SLURM_RESERVATION: "daint"
+
+## BUILD DLAF
+
+.build_common_base:
   stage: build
   timeout: 2 hours
   before_script:
@@ -85,6 +97,22 @@ variables:
     paths:
       - pipeline.yml
 
+.build_common:
+  extends:
+    - .container-builder-cscs-zen2
+    - .build_common_base
+  variables:
+    LD_PRELOAD: "/lib/x86_64-linux-gnu/libSegFault.so"
+
+
+.build_common_gh200:
+  extends:
+    - .container-builder-cscs-gh200
+    - .build_common_base
+  variables:
+    LD_PRELOAD: "/lib/aarch64-linux-gnu/libSegFault.so"
+    SLURM_RESERVATION: "daint"
+
 .build_for_daint-mc:
   variables:
     RUNNER: ".container-runner-daint"
@@ -106,9 +134,27 @@ variables:
     THREADS_MAX_PER_TASK: 32
     THREADS_PER_NODE: 256
 
+.build_for_alps_gh200:
+  variables:
+    RUNNER: ".container-runner-todi-gh200"
+    SLURM_CONSTRAINT: gpu
+    # 64 / 2 to avoid ranks spanning multiple sockets for RANK_6 (TODO: confirm values for GH200)
+    THREADS_MAX_PER_TASK: 32
+    THREADS_PER_NODE: 256
+
+## RUN
+
 .run_common:
   stage: test
   trigger:
     strategy: depend
     forward:
       pipeline_variables: true
+
+.run_todi:
+  extends: .run_common
+  variables:
+    SLURM_RESERVATION: "daint"
+    SLURM_MPI: "pmi2"
+    # Workaround after update until hooks are fixed (TODO: remove; path below looks like a personal scratch dir)
+    ENROOT_LIBRARY_PATH: /capstor/scratch/cscs/fmohamed/enrootlibn
diff --git a/ci/ctest_to_gitlab.sh b/ci/ctest_to_gitlab.sh
index 99b2e7a9f3..b19461494f 100755
--- a/ci/ctest_to_gitlab.sh
+++ b/ci/ctest_to_gitlab.sh
@@ -55,7 +55,6 @@ ARTIFACTS="
 "
 fi
 
-# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197)
 BASE_TEMPLATE="
 include:
   - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
@@ -70,7 +69,7 @@ variables:
   SLURM_EXCLUSIVE: ''
   SLURM_EXACT: ''
   SLURM_CONSTRAINT: $SLURM_CONSTRAINT
-  CRAY_CUDA_MPS: 0
+  CRAY_CUDA_MPS: 1
   MPICH_MAX_THREAD_SAFETY: multiple
 
 {{JOBS}}
@@ -104,7 +103,7 @@ for rank_label in `ctest --print-labels | egrep -o "RANK_[1-9][0-9]?"`; do
         N=`echo "$rank_label" | sed "s/RANK_//"`
         C=$(( THREADS_PER_NODE / N ))
         if [ $C -gt $THREADS_MAX_PER_TASK ]; then
-        C=$THREADS_MAX_PER_TASK
+            C=$THREADS_MAX_PER_TASK
         fi
 
         # Skip label combinations that match no tests
diff --git a/ci/cuda/gcc11_release.yml b/ci/cuda/gcc11_release.yml
deleted file mode 100644
index 93799b4b66..0000000000
--- a/ci/cuda/gcc11_release.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-include:
-  - local: 'ci/common-ci.yml'
-
-cuda gcc11 release deps:
-  extends: .build_deps_common
-  variables:
-    BASE_IMAGE: docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04
-    COMPILER: gcc@11
-    SPACK_ENVIRONMENT: ci/docker/release-cuda.yaml
-    USE_MKL: "ON"
-    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/deps
-
-cuda gcc11 release build:
-  extends:
-    - .build_common
-    - .build_for_daint-gpu
-  needs:
-    - cuda gcc11 release deps
-  variables:
-    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/dlaf:$CI_COMMIT_SHA
-
-cuda gcc11 release test:
-  extends: .run_common
-  variables:
-    PIKA_MPI_ENABLE_POOL: 1
-  needs:
-    - cuda gcc11 release build
-  trigger:
-    include:
-      - artifact: pipeline.yml
-        job: cuda gcc11 release build
diff --git a/ci/cuda/gcc13_release.yml b/ci/cuda/gcc13_release.yml
new file mode 100644
index 0000000000..18c340cdf9
--- /dev/null
+++ b/ci/cuda/gcc13_release.yml
@@ -0,0 +1,28 @@
+include:
+  - local: 'ci/common-ci.yml'
+
+cuda gcc13 release deps:
+  extends: .build_deps_common_gh200
+  variables:
+    BASE_IMAGE: docker.io/nvidia/cuda:12.6.1-devel-ubuntu24.04
+    COMPILER: gcc@13
+    SPACK_ENVIRONMENT: ci/docker/release-cuda-gh200.yaml
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gh200-gcc13-release/deps
+
+cuda gcc13 release build:
+  extends:
+    - .build_common_gh200
+    - .build_for_alps_gh200
+  needs:
+    - cuda gcc13 release deps
+  variables:
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gh200-gcc13-release/dlaf:$CI_COMMIT_SHA
+
+cuda gcc13 release test:
+  extends: .run_todi
+  needs:
+    - cuda gcc13 release build
+  trigger:
+    include:
+      - artifact: pipeline.yml
+        job: cuda gcc13 release build
diff --git a/ci/docker/common-gh200.yaml b/ci/docker/common-gh200.yaml
new file mode 100644
index 0000000000..a64798b84f
--- /dev/null
+++ b/ci/docker/common-gh200.yaml
@@ -0,0 +1,55 @@
+#
+# Distributed Linear Algebra with Future (DLAF)
+#
+# Copyright (c) 2018-2024, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+packages:
+  all:
+    target: [neoverse_v2]
+  # Set nvpl as default blas and lapack provider, and netlib-scalapack as scalapack provider.
+  # Can be overridden in environments if needed.
+  blas:
+    require: 'nvpl-blas'
+  lapack:
+    require: 'nvpl-lapack'
+  scalapack:
+    require: 'netlib-scalapack'
+  mpi:
+    require: 'mpich'
+  blaspp:
+    variants:
+      - '~cuda'
+      - '~openmp'
+      - '~rocm'
+  nvpl-blas:
+    require:
+      - 'threads=openmp'
+  nvpl-lapack:
+    require:
+      - 'threads=openmp'
+  openblas:
+    variants:
+      - 'threads=openmp'
+  mpich:
+    # Fix version to have better control.
+    require:
+      - '@3.4.3'
+    variants:
+      - '~fortran'
+      - '~libxml2'
+  libfabric:
+    # Pin version to be compatible with the libfabric version on todi.
+    require:
+      - '@1.15.2'
+  hwloc:
+    variants:
+      - '~libxml2'
+  git:
+    # Force git as non-buildable to allow deprecated versions in environments
+    # https://github.com/spack/spack/pull/30040
+    buildable: false
diff --git a/ci/docker/deploy.Dockerfile b/ci/docker/deploy.Dockerfile
index 42a11b2264..12e907276a 100644
--- a/ci/docker/deploy.Dockerfile
+++ b/ci/docker/deploy.Dockerfile
@@ -41,6 +41,7 @@ ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2"
 
 # Automatically print stacktraces on segfault
-ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so
+ARG LD_PRELOAD
+ENV LD_PRELOAD=${LD_PRELOAD}
 
 WORKDIR ${BUILD}
diff --git a/ci/docker/release-cuda-gh200.yaml b/ci/docker/release-cuda-gh200.yaml
new file mode 100644
index 0000000000..a7faf67ff2
--- /dev/null
+++ b/ci/docker/release-cuda-gh200.yaml
@@ -0,0 +1,26 @@
+#
+# Distributed Linear Algebra with Future (DLAF)
+#
+# Copyright (c) 2018-2024, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+spack:
+  include:
+  - /spack_environment/common-gh200.yaml
+
+  view: false
+  concretizer:
+    unify:
+      true
+
+  specs:
+    - dla-future@master +cuda +miniapps +ci-test
+
+  packages:
+    all:
+      variants:
+        - 'build_type=Release'
diff --git a/ci/mpi-ctest b/ci/mpi-ctest
index 7eefed90f3..d8f7689e4a 100755
--- a/ci/mpi-ctest
+++ b/ci/mpi-ctest
@@ -11,19 +11,6 @@ CTEST_OUTPUT="$CI_PROJECT_DIR/output/ctest.$SLURM_PROCID.txt"
 
 pushd /DLA-Future-build > /dev/null
 
-export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
-export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
-
-if which nvidia-cuda-mps-control && [ $SLURM_LOCALID = 0 ]; then START_MPS=1; else START_MPS=0; fi &> /dev/null
-
-# Workaround on daint to avoid test hanging (See PR #1197)
-# Launch MPS from a single rank per node
-if [ $START_MPS -eq 1 ]; then
-    nvidia-cuda-mps-control -d
-fi
-# Wait for MPS to start
-sleep 5
-
 # Run the tests, only output on the first rank
 if [[ $SLURM_PROCID == "0" ]]; then
     TZ=CET date +"Run started at: %H:%M:%S %z"
@@ -33,10 +20,6 @@ else
     ctest --output-log "$CTEST_OUTPUT" -V -Q $@
 fi
 
-if [ $START_MPS -eq 1 ]; then
-    echo quit | nvidia-cuda-mps-control
-fi
-
 # Create coverage reports for code run
 if [[ "$ENABLE_COVERAGE" == "YES" ]]; then
     # On daint-mc (XC40) reduce the number of tasks to avoid out-of-memory error