From d575226914c9d27d90d5076d6284ffc7d558c27a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Raffaele=20Solc=C3=A0?=
Date: Mon, 23 Sep 2024 17:58:37 +0200
Subject: [PATCH] gpu to gh200

---
Reviewer notes (text between the three-dash line and the diffstat is not
part of the commit message):

- ci/.gitlab-ci.yml: every include except the new ci/cuda/gcc13_release.yml
  is commented out, and ci/cuda/gcc11_release.yml is deleted.
  NOTE(review): disabling the other pipelines looks temporary - confirm
  before merging.
- ci/common-ci.yml: .build_deps_common and .build_common are split into
  runner-less *_base templates. The existing names now extend
  .container-builder-cscs-zen2; the new *_gh200 variants extend
  .container-builder-cscs-gh200 and set SLURM_RESERVATION.
  .build_for_alps_gh200 and .run_todi are added.
- LD_PRELOAD (libSegFault.so) is set per architecture in the build
  templates, and ci/docker/deploy.Dockerfile now takes it as an ARG.
  NOTE(review): presumably forwarded as a build argument by the build
  script, which is not part of this patch - confirm.
- ci/mpi-ctest no longer starts nvidia-cuda-mps-control, and
  ci/ctest_to_gitlab.sh sets CRAY_CUDA_MPS to 1 instead of 0.

 ci/.gitlab-ci.yml                 | 36 +++++++++---------
 ci/common-ci.yml                  | 62 +++++++++++++++++++++++++++----
 ci/ctest_to_gitlab.sh             |  5 +--
 ci/cuda/gcc11_release.yml         | 31 ----------------
 ci/cuda/gcc13_release.yml         | 28 ++++++++++++++
 ci/docker/common-gh200.yaml       | 55 +++++++++++++++++++++++++++
 ci/docker/deploy.Dockerfile       |  3 +-
 ci/docker/release-cuda-gh200.yaml | 26 +++++++++++++
 ci/mpi-ctest                      | 17 ---------
 9 files changed, 185 insertions(+), 78 deletions(-)
 delete mode 100644 ci/cuda/gcc11_release.yml
 create mode 100644 ci/cuda/gcc13_release.yml
 create mode 100644 ci/docker/common-gh200.yaml
 create mode 100644 ci/docker/release-cuda-gh200.yaml

diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml
index 0204a001ee..c8f6a96fba 100644
--- a/ci/.gitlab-ci.yml
+++ b/ci/.gitlab-ci.yml
@@ -1,19 +1,19 @@
 include:
-  - local: 'ci/cpu/asan_ubsan_lsan.yml'
-  - local: 'ci/cpu/clang15_release_cxx20.yml'
-  - local: 'ci/cpu/clang15_release_stdexec.yml'
-  - local: 'ci/cpu/clang15_release.yml'
-  - local: 'ci/cpu/clang16_release.yml'
-  - local: 'ci/cpu/clang18_release.yml'
-  - local: 'ci/cpu/gcc11_release_stdexec.yml'
-  - local: 'ci/cpu/gcc11_debug_stdexec.yml'
-  - local: 'ci/cpu/gcc12_release_cxx20.yml'
-  - local: 'ci/cpu/gcc13_codecov.yml'
-  - local: 'ci/cpu/gcc13_release.yml'
-  - local: 'ci/cuda/gcc11_release.yml'
-  - local: 'ci/cuda/gcc11_release_scalapack.yml'
-  - local: 'ci/cuda/gcc11_codecov.yml'
-  - local: 'ci/cuda/gcc11_debug_scalapack.yml'
-  - local: 'ci/cuda/gcc13_release_stdexec.yml'
-  - local: 'ci/rocm/clang14_release.yml'
-  - local: 'ci/rocm/clang14_release_stdexec.yml'
+# - local: 'ci/cpu/asan_ubsan_lsan.yml'
+# - local: 'ci/cpu/clang15_release_cxx20.yml'
+# - local: 'ci/cpu/clang15_release_stdexec.yml'
+# - local: 'ci/cpu/clang15_release.yml'
+# - local: 'ci/cpu/clang16_release.yml'
+# - local: 'ci/cpu/clang18_release.yml'
+# - local: 'ci/cpu/gcc11_release_stdexec.yml'
+# - local: 'ci/cpu/gcc11_debug_stdexec.yml'
+# - local: 'ci/cpu/gcc12_release_cxx20.yml'
+# - local: 'ci/cpu/gcc13_codecov.yml'
+# - local: 'ci/cpu/gcc13_release.yml'
+  - local: 'ci/cuda/gcc13_release.yml'
+# - local: 'ci/cuda/gcc11_release_scalapack.yml'
+# - local: 'ci/cuda/gcc11_codecov.yml'
+# - local: 'ci/cuda/gcc11_debug_scalapack.yml'
+# - local: 'ci/cuda/gcc13_release_stdexec.yml'
+# - local: 'ci/rocm/clang14_release.yml'
+# - local: 'ci/rocm/clang14_release_stdexec.yml'
diff --git a/ci/common-ci.yml b/ci/common-ci.yml
index 396f03efac..d3eb156d67 100644
--- a/ci/common-ci.yml
+++ b/ci/common-ci.yml
@@ -9,12 +9,9 @@ stages:
 variables:
   FF_TIMESTAMPS: true
 
-##
-## BUILDS
-##
+## BUILD DEPS
 
-.build_deps_common:
-  extends: .container-builder
+.build_deps_common_base:
   stage: build_deps
   timeout: 6 hours
   before_script:
@@ -58,11 +55,26 @@ variables:
     EXTRA_APTGET: ""
     CXXSTD: 17
     USE_MKL: "OFF"
-    COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
     USE_CODECOV: "false"
 
-.build_common:
-  extends: .container-builder
+.build_deps_common:
+  extends:
+    - .container-builder-cscs-zen2
+    - .build_deps_common_base
+  variables:
+    COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
+
+.build_deps_common_gh200:
+  extends:
+    - .container-builder-cscs-gh200
+    - .build_deps_common_base
+  variables:
+    COMMON_SPACK_ENVIRONMENT: ci/docker/common-gh200.yaml
+    SLURM_RESERVATION: "daint"
+
+## BUILD DLAF
+
+.build_common_base:
   stage: build
   timeout: 2 hours
   before_script:
@@ -85,6 +97,22 @@ variables:
     paths:
       - pipeline.yml
 
+.build_common:
+  extends:
+    - .container-builder-cscs-zen2
+    - .build_common_base
+  variables:
+    LD_PRELOAD: "/lib/x86_64-linux-gnu/libSegFault.so"
+
+
+.build_common_gh200:
+  extends:
+    - .container-builder-cscs-gh200
+    - .build_common_base
+  variables:
+    LD_PRELOAD: "/lib/aarch64-linux-gnu/libSegFault.so"
+    SLURM_RESERVATION: "daint"
+
 .build_for_daint-mc:
   variables:
     RUNNER: ".container-runner-daint"
@@ -106,9 +134,27 @@ variables:
     THREADS_MAX_PER_TASK: 32
     THREADS_PER_NODE: 256
 
+.build_for_alps_gh200:
+  variables:
+    RUNNER: ".container-runner-todi-gh200"
+    SLURM_CONSTRAINT: gpu
+    # 64 / 2 to avoid ranks on multiple sockets for RANK6
+    THREADS_MAX_PER_TASK: 32
+    THREADS_PER_NODE: 256
+
+## RUN
+
 .run_common:
   stage: test
   trigger:
     strategy: depend
     forward:
       pipeline_variables: true
+
+.run_todi:
+  extends: .run_common
+  variables:
+    SLURM_RESERVATION: "daint"
+    SLURM_MPI: "pmi2"
+    # Workaround after update until hooks are fixed
+    ENROOT_LIBRARY_PATH: /capstor/scratch/cscs/fmohamed/enrootlibn
diff --git a/ci/ctest_to_gitlab.sh b/ci/ctest_to_gitlab.sh
index 99b2e7a9f3..b19461494f 100755
--- a/ci/ctest_to_gitlab.sh
+++ b/ci/ctest_to_gitlab.sh
@@ -55,7 +55,6 @@ ARTIFACTS="
 "
 fi
 
-# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197)
 BASE_TEMPLATE="
 include:
   - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
@@ -70,7 +69,7 @@ variables:
   SLURM_EXCLUSIVE: ''
   SLURM_EXACT: ''
   SLURM_CONSTRAINT: $SLURM_CONSTRAINT
-  CRAY_CUDA_MPS: 0
+  CRAY_CUDA_MPS: 1
   MPICH_MAX_THREAD_SAFETY: multiple
 
 {{JOBS}}
@@ -104,7 +103,7 @@ for rank_label in `ctest --print-labels | egrep -o "RANK_[1-9][0-9]?"`; do
     N=`echo "$rank_label" | sed "s/RANK_//"`
     C=$(( THREADS_PER_NODE / N ))
     if [ $C -gt $THREADS_MAX_PER_TASK ]; then
-      C=$THREADS_MAX_PER_TASK
+        C=$THREADS_MAX_PER_TASK
     fi
 
     # Skip label combinations that match no tests
diff --git a/ci/cuda/gcc11_release.yml b/ci/cuda/gcc11_release.yml
deleted file mode 100644
index 93799b4b66..0000000000
--- a/ci/cuda/gcc11_release.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-include:
-  - local: 'ci/common-ci.yml'
-
-cuda gcc11 release deps:
-  extends: .build_deps_common
-  variables:
-    BASE_IMAGE: docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04
-    COMPILER: gcc@11
-    SPACK_ENVIRONMENT: ci/docker/release-cuda.yaml
-    USE_MKL: "ON"
-    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/deps
-
-cuda gcc11 release build:
-  extends:
-    - .build_common
-    - .build_for_daint-gpu
-  needs:
-    - cuda gcc11 release deps
-  variables:
-    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/dlaf:$CI_COMMIT_SHA
-
-cuda gcc11 release test:
-  extends: .run_common
-  variables:
-    PIKA_MPI_ENABLE_POOL: 1
-  needs:
-    - cuda gcc11 release build
-  trigger:
-    include:
-      - artifact: pipeline.yml
-        job: cuda gcc11 release build
diff --git a/ci/cuda/gcc13_release.yml b/ci/cuda/gcc13_release.yml
new file mode 100644
index 0000000000..18c340cdf9
--- /dev/null
+++ b/ci/cuda/gcc13_release.yml
@@ -0,0 +1,28 @@
+include:
+  - local: 'ci/common-ci.yml'
+
+cuda gcc13 release deps:
+  extends: .build_deps_common_gh200
+  variables:
+    BASE_IMAGE: docker.io/nvidia/cuda:12.6.1-devel-ubuntu24.04
+    COMPILER: gcc@13
+    SPACK_ENVIRONMENT: ci/docker/release-cuda-gh200.yaml
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gh200-gcc13-release/deps
+
+cuda gcc13 release build:
+  extends:
+    - .build_common_gh200
+    - .build_for_alps_gh200
+  needs:
+    - cuda gcc13 release deps
+  variables:
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gh200-gcc13-release/dlaf:$CI_COMMIT_SHA
+
+cuda gcc13 release test:
+  extends: .run_todi
+  needs:
+    - cuda gcc13 release build
+  trigger:
+    include:
+      - artifact: pipeline.yml
+        job: cuda gcc13 release build
diff --git a/ci/docker/common-gh200.yaml b/ci/docker/common-gh200.yaml
new file mode 100644
index 0000000000..a64798b84f
--- /dev/null
+++ b/ci/docker/common-gh200.yaml
@@ -0,0 +1,55 @@
+#
+# Distributed Linear Algebra with Future (DLAF)
+#
+# Copyright (c) 2018-2024, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+packages:
+  all:
+    target: [neoverse_v2]
+  # Set nvpl as default blas and lapack provider, netlib-scalapack as scalapack provider.
+  # Can be overwritten in environments if needed.
+  blas:
+    require: 'nvpl-blas'
+  lapack:
+    require: 'nvpl-lapack'
+  scalapack:
+    require: 'netlib-scalapack'
+  mpi:
+    require: 'mpich'
+  blaspp:
+    variants:
+      - '~cuda'
+      - '~openmp'
+      - '~rocm'
+  nvpl-blas:
+    require:
+      - 'threads=openmp'
+  nvpl-lapack:
+    require:
+      - 'threads=openmp'
+  openblas:
+    variants:
+      - 'threads=openmp'
+  mpich:
+    # Fix version to have better control.
+    require:
+      - '@3.4.3'
+    variants:
+      - '~fortran'
+      - '~libxml2'
+  libfabric:
+    # Fix version to be compatible with version on todi
+    require:
+      - '@1.15.2'
+  hwloc:
+    variants:
+      - '~libxml2'
+  git:
+    # Force git as non-buildable to allow deprecated versions in environments
+    # https://github.com/spack/spack/pull/30040
+    buildable: false
diff --git a/ci/docker/deploy.Dockerfile b/ci/docker/deploy.Dockerfile
index 42a11b2264..12e907276a 100644
--- a/ci/docker/deploy.Dockerfile
+++ b/ci/docker/deploy.Dockerfile
@@ -41,6 +41,7 @@ ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2"
 
 # Automatically print stacktraces on segfault
-ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so
+ARG LD_PRELOAD
+ENV LD_PRELOAD=${LD_PRELOAD}
 
 WORKDIR ${BUILD}
diff --git a/ci/docker/release-cuda-gh200.yaml b/ci/docker/release-cuda-gh200.yaml
new file mode 100644
index 0000000000..a7faf67ff2
--- /dev/null
+++ b/ci/docker/release-cuda-gh200.yaml
@@ -0,0 +1,26 @@
+#
+# Distributed Linear Algebra with Future (DLAF)
+#
+# Copyright (c) 2018-2024, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+spack:
+  include:
+  - /spack_environment/common-gh200.yaml
+
+  view: false
+  concretizer:
+    unify:
+      true
+
+  specs:
+    - dla-future@master +cuda +miniapps +ci-test
+
+  packages:
+    all:
+      variants:
+        - 'build_type=Release'
diff --git a/ci/mpi-ctest b/ci/mpi-ctest
index 7eefed90f3..d8f7689e4a 100755
--- a/ci/mpi-ctest
+++ b/ci/mpi-ctest
@@ -11,19 +11,6 @@ CTEST_OUTPUT="$CI_PROJECT_DIR/output/ctest.$SLURM_PROCID.txt"
 
 pushd /DLA-Future-build > /dev/null
 
-export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
-export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
-
-if which nvidia-cuda-mps-control && [ $SLURM_LOCALID = 0 ]; then START_MPS=1; else START_MPS=0; fi &> /dev/null
-
-# Workaround on daint to avoid test hanging (See PR #1197)
-# Launch MPS from a single rank per node
-if [ $START_MPS -eq 1 ]; then
-    nvidia-cuda-mps-control -d
-fi
-# Wait for MPS to start
-sleep 5
-
 # Run the tests, only output on the first rank
 if [[ $SLURM_PROCID == "0" ]]; then
     TZ=CET date +"Run started at: %H:%M:%S %z"
@@ -33,10 +20,6 @@ else
     ctest --output-log "$CTEST_OUTPUT" -V -Q $@
 fi
 
-if [ $START_MPS -eq 1 ]; then
-    echo quit | nvidia-cuda-mps-control
-fi
-
 # Create coverage reports for code run
 if [[ "$ENABLE_COVERAGE" == "YES" ]]; then
     # On daint-mc (XC40) reduce the number of tasks to avoid out-of-memory error