diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1247571d2..926714489 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -16,11 +16,6 @@ jobs: steps: - uses: actions/checkout@v3 - run: docker build --build-arg=KUBERNETES_MINOR_VERSION=latest --file Dockerfile.kubetest2 . - build-nccl: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - run: docker build --file e2e2/test/images/Dockerfile.aws-efa-nccl-tests . build-neuronx: runs-on: ubuntu-latest steps: diff --git a/e2e2/test/cases/neuron/neuron_test.go b/e2e2/test/cases/neuron/neuron_test.go index 7a86b15b5..35d5bdc46 100644 --- a/e2e2/test/cases/neuron/neuron_test.go +++ b/e2e2/test/cases/neuron/neuron_test.go @@ -32,7 +32,7 @@ func TestMPIJobPytorchTraining(t *testing.T) { WithLabel("hardware", "gpu"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { if *neuronTestImage == "" { - t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.neuronx-tests to build the image and -neuronTestImage to set the image url")) + t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile to build the image and -neuronTestImage to set the image url")) } var err error renderedNeuronSingleNodeManifest, err = fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{ diff --git a/e2e2/test/cases/nvidia/main_test.go b/e2e2/test/cases/nvidia/main_test.go index 2baa03149..0dc1ed048 100644 --- a/e2e2/test/cases/nvidia/main_test.go +++ b/e2e2/test/cases/nvidia/main_test.go @@ -23,13 +23,13 @@ import ( ) var ( - testenv env.Environment - nodeType *string - efaEnabled *bool - ncclTestImage *string - nodeCount int - gpuPerNode int - efaPerNode int + testenv env.Environment + nodeType *string + efaEnabled *bool + 
nvidiaTestImage *string + nodeCount int + gpuPerNode int + efaPerNode int ) var ( @@ -43,7 +43,7 @@ var ( func TestMain(m *testing.M) { nodeType = flag.String("nodeType", "", "node type for the tests") - ncclTestImage = flag.String("ncclTestImage", "", "nccl test image for nccl tests") + nvidiaTestImage = flag.String("nvidiaTestImage", "", "nvidia test image for nccl and unit tests") efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests") cfg, err := envconf.NewFromFlags() if err != nil { diff --git a/e2e2/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml b/e2e2/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml index 1af096145..4a2c46b6f 100644 --- a/e2e2/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml +++ b/e2e2/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml @@ -16,7 +16,7 @@ spec: spec: restartPolicy: OnFailure containers: - - image: {{.NcclTestImage}} + - image: {{.NvidiaTestImage}} imagePullPolicy: Always name: nccl-test-launcher env: @@ -82,7 +82,7 @@ spec: emptyDir: medium: Memory containers: - - image: {{.NcclTestImage}} + - image: {{.NvidiaTestImage}} imagePullPolicy: Always name: nccl-test-worker volumeMounts: diff --git a/e2e2/test/cases/nvidia/mpi_test.go b/e2e2/test/cases/nvidia/mpi_test.go index 46ed24c67..2d26b972f 100644 --- a/e2e2/test/cases/nvidia/mpi_test.go +++ b/e2e2/test/cases/nvidia/mpi_test.go @@ -31,7 +31,7 @@ type ncclTestManifestTplVars struct { WorkerNodeCount int WorkerNodeGpuCount int GpuPerNode int - NcclTestImage string + NvidiaTestImage string EfaInterfacePerNode int EfaUseDeviceRdma int } @@ -77,8 +77,8 @@ func TestMPIJobPytorchTraining(t *testing.T) { WithLabel("hardware", "gpu"). WithLabel("hardware", "efa"). 
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - if *ncclTestImage == "" { - t.Fatal(fmt.Errorf("efaImage must be set to run nccl test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.aws-efa-nccl-tests to build the image and -efaImage to set the image url")) + if *nvidiaTestImage == "" { + t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run nccl test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url")) } // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl-base.html#nccl-start-base-test var EfaUseDeviceRdma int @@ -90,7 +90,7 @@ func TestMPIJobPytorchTraining(t *testing.T) { WorkerNodeCount: nodeCount - 1, WorkerNodeGpuCount: (nodeCount - 1) * gpuPerNode, GpuPerNode: gpuPerNode, - NcclTestImage: *ncclTestImage, + NvidiaTestImage: *nvidiaTestImage, EfaInterfacePerNode: efaPerNode, EfaUseDeviceRdma: EfaUseDeviceRdma, }) diff --git a/e2e2/test/cases/nvidia/unit_test.go b/e2e2/test/cases/nvidia/unit_test.go new file mode 100644 index 000000000..d4cdb5008 --- /dev/null +++ b/e2e2/test/cases/nvidia/unit_test.go @@ -0,0 +1,71 @@ +package nvidia + +import ( + "context" + _ "embed" + "fmt" + "testing" + "time" + + fwext "github.com/aws/aws-k8s-tester/e2e2/internal/framework_extensions" + "sigs.k8s.io/e2e-framework/klient/wait" + "sigs.k8s.io/e2e-framework/pkg/envconf" + "sigs.k8s.io/e2e-framework/pkg/features" + + batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var ( + //go:embed manifests/job-unit-test-single-node.yaml + jobUnitTestSingleNodeManifest []byte + renderedJobUnitTestSingleNodeManifest []byte +) + +type unitTestManifestTplVars struct { + NvidiaTestImage string +} + +func TestSingleNodeUnitTest(t *testing.T) { + unitTest := features.New("unit-test"). + WithLabel("suite", "nvidia"). + WithLabel("hardware", "gpu"). 
+ Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + if *nvidiaTestImage == "" { + t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url")) + } + var err error + renderedJobUnitTestSingleNodeManifest, err = fwext.RenderManifests(jobUnitTestSingleNodeManifest, unitTestManifestTplVars{ + NvidiaTestImage: *nvidiaTestImage, + }) + if err != nil { + t.Fatal(err) + } + err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest) + if err != nil { + t.Fatal(err) + } + return ctx + }). + Assess("Unit test Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"}, + } + err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), + wait.WithTimeout(time.Minute*20)) + if err != nil { + t.Fatal(err) + } + return ctx + }). + Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + err := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest) + if err != nil { + t.Fatal(err) + } + return ctx + }). 
+ Feature() + + testenv.Test(t, unitTest) +} diff --git a/e2e2/test/images/Dockerfile.aws-efa-nccl-tests b/e2e2/test/images/Dockerfile.aws-efa-nccl-tests deleted file mode 100644 index 09d9fb15d..000000000 --- a/e2e2/test/images/Dockerfile.aws-efa-nccl-tests +++ /dev/null @@ -1,93 +0,0 @@ -# Start with the NVIDIA CUDA base image -FROM nvidia/cuda:12.5.0-devel-ubuntu22.04 - -ARG EFA_INSTALLER_VERSION=latest -# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0 -ARG AWS_OFI_NCCL_VERSION=1.9.1 -ARG NCCL_TESTS_VERSION=master - -# Install necessary dependencies -RUN apt-get update -y -RUN apt-get remove -y --allow-change-held-packages \ - libmlx5-1 \ - ibverbs-utils \ - libibverbs-dev \ - libibverbs1 \ - libnccl2 \ - libnccl-dev - -RUN rm -rf /opt/hpcx \ - && rm -rf /usr/local/mpi \ - && rm -rf /usr/local/ucx \ - && rm -f /etc/ld.so.conf.d/hpcx.conf \ - && ldconfig - -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ - sudo \ - git \ - gcc \ - vim \ - kmod \ - openssh-client \ - openssh-server \ - build-essential \ - wget curl \ - autoconf \ - libtool \ - gdb \ - automake \ - python3-distutils \ - cmake \ - apt-utils \ - devscripts \ - debhelper \ - libsubunit-dev \ - check \ - pkg-config \ - libhwloc-dev - -RUN mkdir -p /var/run/sshd -RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ - echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ - sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config -ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH -ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH - -# Install EFA -RUN cd $HOME \ - && curl -O 
https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ - && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ - && rm -rf $HOME/aws-efa-installer - -# Install NCCL -RUN apt-key del 7fa2af80 \ - && curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \ - && dpkg -i cuda-keyring_1.0-1_all.deb \ - && sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2 - -## Install AWS-OFI-NCCL plugin -RUN export OPAL_PREFIX="" \ - && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \ - && cd /opt/aws-ofi-nccl \ - && git checkout v${AWS_OFI_NCCL_VERSION}-aws \ - && ./autogen.sh \ - && ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-libfabric=/opt/amazon/efa/ \ - --with-cuda=/usr/local/cuda \ - --with-mpi=/opt/amazon/openmpi/ \ - && make && make install - -# Install NCCL Tests -RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ - && cd /opt/nccl-tests \ - && git checkout ${NCCL_TESTS_VERSION} \ - && make MPI=1 \ - MPI_HOME=/opt/amazon/openmpi/ \ - CUDA_HOME=/usr/local/cuda - -# Set a default command for debugging or modify as per requirements -ENV NCCL_PROTO simple -RUN rm -rf /var/lib/apt/lists/* -ENV LD_PRELOAD /usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD \ No newline at end of file