From 111c57cd38844f80e49bc16285d08b3f41f73e96 Mon Sep 17 00:00:00 2001
From: Weicong Wang
Date: Thu, 6 Jun 2024 21:20:22 +0000
Subject: [PATCH 1/2] Update aws-efa-nccl-tests docker file to the latest cuda and aws-ofi-nccl version.

---
Reviewer notes (text in this section is not part of the commit message):
 - Bumps the CUDA devel base image from 12.2.0 to 12.5.0 and the default
   AWS_OFI_NCCL_VERSION build argument from 1.7.4 to 1.9.1.
 - NOTE(review): the positions of the blank context lines in the hunk were
   reconstructed from the "-1,8 +1,9" line counts; confirm against the
   target file.

 e2e2/test/images/Dockerfile.aws-efa-nccl-tests | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/e2e2/test/images/Dockerfile.aws-efa-nccl-tests b/e2e2/test/images/Dockerfile.aws-efa-nccl-tests
index 0b2b638f8..09d9fb15d 100644
--- a/e2e2/test/images/Dockerfile.aws-efa-nccl-tests
+++ b/e2e2/test/images/Dockerfile.aws-efa-nccl-tests
@@ -1,8 +1,9 @@
-FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+# Start with the NVIDIA CUDA base image
+FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
 
 ARG EFA_INSTALLER_VERSION=latest
 # 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
-ARG AWS_OFI_NCCL_VERSION=1.7.4
+ARG AWS_OFI_NCCL_VERSION=1.9.1
 ARG NCCL_TESTS_VERSION=master
 
 # Install necessary dependencies

From 8ddfff6fba3d6593e40c7d3b0ffc84dd80d040cb Mon Sep 17 00:00:00 2001
From: Weicong Wang
Date: Thu, 6 Jun 2024 21:39:20 +0000
Subject: [PATCH 2/2] Add the NCCL image build step to the PR checks

---
Reviewer notes (text in this section is not part of the commit message):
 - Adds a "build-nccl" job that runs docker build on
   e2e2/test/images/Dockerfile.aws-efa-nccl-tests with "." as the build
   context.
 - The new last line of ci.yaml is now newline-terminated (the trailing
   "\ No newline at end of file" marker on the added side was dropped), so
   the post-image blob id on the index line below predates that change.
 - NOTE(review): leading whitespace in the YAML hunk was reconstructed;
   confirm it matches the indentation used in the target file.

 .github/workflows/ci.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 5cb903d30..d24950e73 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -15,4 +15,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - run: docker build --build-arg=KUBERNETES_MINOR_VERSION=latest --file Dockerfile.kubetest2 .
\ No newline at end of file
+    - run: docker build --build-arg=KUBERNETES_MINOR_VERSION=latest --file Dockerfile.kubetest2 .
+  build-nccl:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - run: docker build --file e2e2/test/images/Dockerfile.aws-efa-nccl-tests .