Skip to content

Commit

Permalink
gpu to gh200
Browse files Browse the repository at this point in the history
  • Loading branch information
rasolca committed Dec 10, 2024
1 parent e0d2619 commit d575226
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 78 deletions.
36 changes: 18 additions & 18 deletions ci/.gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
include:
- local: 'ci/cpu/asan_ubsan_lsan.yml'
- local: 'ci/cpu/clang15_release_cxx20.yml'
- local: 'ci/cpu/clang15_release_stdexec.yml'
- local: 'ci/cpu/clang15_release.yml'
- local: 'ci/cpu/clang16_release.yml'
- local: 'ci/cpu/clang18_release.yml'
- local: 'ci/cpu/gcc11_release_stdexec.yml'
- local: 'ci/cpu/gcc11_debug_stdexec.yml'
- local: 'ci/cpu/gcc12_release_cxx20.yml'
- local: 'ci/cpu/gcc13_codecov.yml'
- local: 'ci/cpu/gcc13_release.yml'
- local: 'ci/cuda/gcc11_release.yml'
- local: 'ci/cuda/gcc11_release_scalapack.yml'
- local: 'ci/cuda/gcc11_codecov.yml'
- local: 'ci/cuda/gcc11_debug_scalapack.yml'
- local: 'ci/cuda/gcc13_release_stdexec.yml'
- local: 'ci/rocm/clang14_release.yml'
- local: 'ci/rocm/clang14_release_stdexec.yml'
# NOTE(review): in the entries that follow, every pipeline include except
# 'ci/cuda/gcc13_release.yml' is commented out. Presumably this is a temporary
# measure while the GH200 pipeline is brought up -- confirm, and re-enable the
# remaining configurations before this is merged.
# - local: 'ci/cpu/asan_ubsan_lsan.yml'
# - local: 'ci/cpu/clang15_release_cxx20.yml'
# - local: 'ci/cpu/clang15_release_stdexec.yml'
# - local: 'ci/cpu/clang15_release.yml'
# - local: 'ci/cpu/clang16_release.yml'
# - local: 'ci/cpu/clang18_release.yml'
# - local: 'ci/cpu/gcc11_release_stdexec.yml'
# - local: 'ci/cpu/gcc11_debug_stdexec.yml'
# - local: 'ci/cpu/gcc12_release_cxx20.yml'
# - local: 'ci/cpu/gcc13_codecov.yml'
# - local: 'ci/cpu/gcc13_release.yml'
- local: 'ci/cuda/gcc13_release.yml'
# - local: 'ci/cuda/gcc11_release_scalapack.yml'
# - local: 'ci/cuda/gcc11_codecov.yml'
# - local: 'ci/cuda/gcc11_debug_scalapack.yml'
# - local: 'ci/cuda/gcc13_release_stdexec.yml'
# - local: 'ci/rocm/clang14_release.yml'
# - local: 'ci/rocm/clang14_release_stdexec.yml'
62 changes: 54 additions & 8 deletions ci/common-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,9 @@ stages:
variables:
FF_TIMESTAMPS: true

##
## BUILDS
##
## BUILD DEPS

.build_deps_common:
extends: .container-builder
.build_deps_common_base:
stage: build_deps
timeout: 6 hours
before_script:
Expand Down Expand Up @@ -58,11 +55,26 @@ variables:
EXTRA_APTGET: ""
CXXSTD: 17
USE_MKL: "OFF"
COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
USE_CODECOV: "false"

.build_common:
extends: .container-builder
# Dependency-build template: extends `.container-builder-cscs-zen2` together
# with the shared `.build_deps_common_base` settings, and selects
# ci/docker/common.yaml as the common Spack environment file.
.build_deps_common:
  extends:
    - .container-builder-cscs-zen2
    - .build_deps_common_base
  variables:
    COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml

# GH200 counterpart of `.build_deps_common`: extends
# `.container-builder-cscs-gh200` together with the shared
# `.build_deps_common_base` settings, and selects the GH200-specific common
# Spack environment file.
.build_deps_common_gh200:
  extends:
    - .container-builder-cscs-gh200
    - .build_deps_common_base
  variables:
    COMMON_SPACK_ENVIRONMENT: ci/docker/common-gh200.yaml
    # NOTE(review): presumably the Slurm reservation the builder must use to
    # obtain GH200 nodes -- confirm it is still required.
    SLURM_RESERVATION: "daint"

## BUILD DLAF

.build_common_base:
stage: build
timeout: 2 hours
before_script:
Expand All @@ -85,6 +97,22 @@ variables:
paths:
- pipeline.yml

# DLAF build template: extends `.container-builder-cscs-zen2` together with
# the shared `.build_common_base` settings.
.build_common:
  extends:
    - .container-builder-cscs-zen2
    - .build_common_base
  variables:
    # x86_64 location of libSegFault.so.
    # NOTE(review): presumably forwarded to the LD_PRELOAD build argument of
    # ci/docker/deploy.Dockerfile -- confirm.
    LD_PRELOAD: "/lib/x86_64-linux-gnu/libSegFault.so"


# GH200 counterpart of `.build_common`: extends
# `.container-builder-cscs-gh200` together with the shared
# `.build_common_base` settings.
.build_common_gh200:
  extends:
    - .container-builder-cscs-gh200
    - .build_common_base
  variables:
    # aarch64 location of libSegFault.so.
    # NOTE(review): presumably forwarded to the LD_PRELOAD build argument of
    # ci/docker/deploy.Dockerfile; confirm the library exists at this path in
    # the GH200 base image.
    LD_PRELOAD: "/lib/aarch64-linux-gnu/libSegFault.so"
    # NOTE(review): presumably the Slurm reservation the builder must use to
    # obtain GH200 nodes -- confirm it is still required.
    SLURM_RESERVATION: "daint"

.build_for_daint-mc:
variables:
RUNNER: ".container-runner-daint"
Expand All @@ -106,9 +134,27 @@ variables:
THREADS_MAX_PER_TASK: 32
THREADS_PER_NODE: 256

# Variables for builds whose generated test pipeline runs through the
# `.container-runner-todi-gh200` runner.
.build_for_alps_gh200:
  variables:
    RUNNER: ".container-runner-todi-gh200"
    SLURM_CONSTRAINT: gpu
    # ci/ctest_to_gitlab.sh divides THREADS_PER_NODE by the number of ranks
    # and caps the result at THREADS_MAX_PER_TASK.
    # 64 / 2 to avoid ranks on multiple sockets for RANK6
    # NOTE(review): both values and the "64 / 2" rationale are identical to
    # the template defined just before this one; confirm they match the GH200
    # node layout.
    THREADS_MAX_PER_TASK: 32
    THREADS_PER_NODE: 256

## RUN

# Base template for test jobs: a trigger job in the `test` stage.
.run_common:
  stage: test
  trigger:
    # Make the trigger job's status depend on the triggered pipeline.
    strategy: depend
    forward:
      # Pass the pipeline variables on to the triggered pipeline.
      pipeline_variables: true

# Test-job template built on `.run_common`, adding the Slurm and enroot
# variables used for runs on todi.
.run_todi:
  extends: .run_common
  variables:
    SLURM_RESERVATION: "daint"
    SLURM_MPI: "pmi2"
    # Workaround after update until hooks are fixed
    # NOTE(review): this path points into a personal scratch directory; remove
    # it once the hooks are fixed.
    ENROOT_LIBRARY_PATH: /capstor/scratch/cscs/fmohamed/enrootlibn
5 changes: 2 additions & 3 deletions ci/ctest_to_gitlab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ ARTIFACTS="
"
fi

# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197)
BASE_TEMPLATE="
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
Expand All @@ -70,7 +69,7 @@ variables:
SLURM_EXCLUSIVE: ''
SLURM_EXACT: ''
SLURM_CONSTRAINT: $SLURM_CONSTRAINT
CRAY_CUDA_MPS: 0
CRAY_CUDA_MPS: 1
MPICH_MAX_THREAD_SAFETY: multiple
{{JOBS}}
Expand Down Expand Up @@ -104,7 +103,7 @@ for rank_label in `ctest --print-labels | egrep -o "RANK_[1-9][0-9]?"`; do
N=`echo "$rank_label" | sed "s/RANK_//"`
C=$(( THREADS_PER_NODE / N ))
if [ $C -gt $THREADS_MAX_PER_TASK ]; then
C=$THREADS_MAX_PER_TASK
C=$THREADS_MAX_PER_TASK
fi

# Skip label combinations that match no tests
Expand Down
31 changes: 0 additions & 31 deletions ci/cuda/gcc11_release.yml

This file was deleted.

28 changes: 28 additions & 0 deletions ci/cuda/gcc13_release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
include:
- local: 'ci/common-ci.yml'

# Builds the dependency image for the CUDA / gcc 13 release configuration
# from the GH200 dependency-build template.
cuda gcc13 release deps:
  extends: .build_deps_common_gh200
  variables:
    BASE_IMAGE: docker.io/nvidia/cuda:12.6.1-devel-ubuntu24.04
    COMPILER: gcc@13
    SPACK_ENVIRONMENT: ci/docker/release-cuda-gh200.yaml
    # Registry location of the resulting dependency image.
    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gh200-gcc13-release/deps

# Builds the DLAF image for the CUDA / gcc 13 release configuration; it needs
# the matching deps job to have completed first.
cuda gcc13 release build:
  extends:
    - .build_common_gh200
    - .build_for_alps_gh200
  needs:
    - cuda gcc13 release deps
  variables:
    # The image is tagged with the commit SHA.
    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gh200-gcc13-release/dlaf:$CI_COMMIT_SHA

# Triggers the child pipeline described by the `pipeline.yml` artifact of the
# build job.
cuda gcc13 release test:
  extends: .run_todi
  needs:
    - cuda gcc13 release build
  trigger:
    include:
      - artifact: pipeline.yml
        job: cuda gcc13 release build
55 changes: 55 additions & 0 deletions ci/docker/common-gh200.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# Distributed Linear Algebra with Future (DLAF)
#
# Copyright (c) 2018-2024, ETH Zurich
# All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#

# Spack package preferences shared by the GH200 CI environments.
packages:
  all:
    target:
      - neoverse_v2
  # Default providers for the virtual packages: nvpl for blas and lapack,
  # netlib-scalapack for scalapack and mpich for mpi.
  # Can be overwritten in environments if needed.
  blas:
    require: 'nvpl-blas'
  lapack:
    require: 'nvpl-lapack'
  scalapack:
    require: 'netlib-scalapack'
  mpi:
    require: 'mpich'
  blaspp:
    variants:
      - '~cuda'
      - '~openmp'
      - '~rocm'
  nvpl-blas:
    require:
      - 'threads=openmp'
  nvpl-lapack:
    require:
      - 'threads=openmp'
  openblas:
    variants:
      - 'threads=openmp'
  mpich:
    # Pin the version to have better control.
    require:
      - '@3.4.3'
    variants:
      - '~fortran'
      - '~libxml2'
  libfabric:
    # Pin the version to stay compatible with the version on todi.
    require:
      - '@1.15.2'
  hwloc:
    variants:
      - '~libxml2'
  git:
    # Force git as non-buildable to allow deprecated versions in environments
    # https://github.com/spack/spack/pull/30040
    buildable: false
3 changes: 2 additions & 1 deletion ci/docker/deploy.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2"

# Automatically print stacktraces on segfault
ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so
ARG LD_PRELOAD
ENV LD_PRELOAD=${LD_PRELOAD}

WORKDIR ${BUILD}
26 changes: 26 additions & 0 deletions ci/docker/release-cuda-gh200.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#
# Distributed Linear Algebra with Future (DLAF)
#
# Copyright (c) 2018-2024, ETH Zurich
# All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#

# Spack environment for the CUDA release build of DLA-Future on GH200.
spack:
  # Shared GH200 package preferences.
  include:
    - /spack_environment/common-gh200.yaml

  view: false
  concretizer:
    unify: true

  specs:
    - dla-future@master +cuda +miniapps +ci-test

  packages:
    all:
      # Build all packages in Release mode.
      variants:
        - 'build_type=Release'
17 changes: 0 additions & 17 deletions ci/mpi-ctest
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,6 @@ CTEST_OUTPUT="$CI_PROJECT_DIR/output/ctest.$SLURM_PROCID.txt"

pushd /DLA-Future-build > /dev/null

export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log

if which nvidia-cuda-mps-control && [ $SLURM_LOCALID = 0 ]; then START_MPS=1; else START_MPS=0; fi &> /dev/null

# Workaround on daint to avoid test hanging (See PR #1197)
# Launch MPS from a single rank per node
if [ $START_MPS -eq 1 ]; then
nvidia-cuda-mps-control -d
fi
# Wait for MPS to start
sleep 5

# Run the tests, only output on the first rank
if [[ $SLURM_PROCID == "0" ]]; then
TZ=CET date +"Run started at: %H:%M:%S %z"
Expand All @@ -33,10 +20,6 @@ else
ctest --output-log "$CTEST_OUTPUT" -V -Q $@
fi

if [ $START_MPS -eq 1 ]; then
echo quit | nvidia-cuda-mps-control
fi

# Create coverage reports for code run
if [[ "$ENABLE_COVERAGE" == "YES" ]]; then
# On daint-mc (XC40) reduce the number of tasks to avoid out-of-memory error
Expand Down

0 comments on commit d575226

Please sign in to comment.