diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f206ab859..b9b3dbc56 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -25,4 +25,4 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - run: docker build --file e2e2/test/images/Dockerfile.neuronx-tests .
\ No newline at end of file
+      - run: docker build --file e2e2/test/images/neuron/Dockerfile .
\ No newline at end of file
diff --git a/e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml b/e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml
index 4ed0c65b9..bf1d670e3 100644
--- a/e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml
+++ b/e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml
@@ -15,7 +15,7 @@ spec:
           image: {{.NeuronTestImage}}
           command:
             - /bin/bash
-            - ./pytorch_tests/singleNodeTest.sh
+            - ./tests/singleNodeTest.sh
           imagePullPolicy: Always
           resources:
             limits:
diff --git a/e2e2/test/cases/neuron/neuron_test.go b/e2e2/test/cases/neuron/neuron_test.go
index 26bf61a58..7a86b15b5 100644
--- a/e2e2/test/cases/neuron/neuron_test.go
+++ b/e2e2/test/cases/neuron/neuron_test.go
@@ -34,7 +34,9 @@ func TestMPIJobPytorchTraining(t *testing.T) {
 			if *neuronTestImage == "" {
-				t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.neuronx-tests to build the image and -neuronTestImage to set the image url"))
+				t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/neuron/Dockerfile to build the image and -neuronTestImage to set the image url"))
 			}
-			renderedNeuronSingleNodeManifest, err := fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{
+			// Declare err separately so the assignment below writes to the outer
+			// renderedNeuronSingleNodeManifest instead of shadowing it with :=.
+			var err error
+			renderedNeuronSingleNodeManifest, err = fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{
 				NeuronTestImage: *neuronTestImage,
 			})
 			if err != nil {
diff --git a/e2e2/test/images/Dockerfile.neuronx-tests b/e2e2/test/images/Dockerfile.neuronx-tests
deleted file mode 100644
index 041d7609a..000000000
--- a/e2e2/test/images/Dockerfile.neuronx-tests
+++ /dev/null
@@ -1,5 +0,0 @@
-# Start with the Neuron base image
-FROM public.ecr.aws/neuron/pytorch-training-neuronx:2.1.2-neuronx-py310-sdk2.18.2-ubuntu20.04
-
-WORKDIR /
-COPY e2e2/test/images/pytorch_tests/ ./pytorch_tests
\ No newline at end of file
diff --git a/e2e2/test/images/neuron/Dockerfile b/e2e2/test/images/neuron/Dockerfile
new file mode 100644
index 000000000..8f24a1592
--- /dev/null
+++ b/e2e2/test/images/neuron/Dockerfile
@@ -0,0 +1,182 @@
+FROM public.ecr.aws/docker/library/ubuntu:20.04
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+# Neuron SDK components version numbers
+ARG NEURONX_DISTRIBUTED_VERSION=0.7.0
+ARG NEURONX_CC_VERSION=2.13.72.0
+ARG NEURONX_FRAMEWORK_VERSION=2.1.2.2.1.0
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.20.22.0-c101c322e
+ARG NEURONX_RUNTIME_LIB_VERSION=2.20.22.0-1b3ca6425
+ARG NEURONX_TOOLS_VERSION=2.17.1.0
+
+ARG PYTHON=python3.10
+ARG PYTHON_VERSION=3.10.12
+ARG PIP=pip3
+ARG OMPI_VERSION=4.1.5
+
+# This arg is required to stop docker build from waiting for region configuration while installing tzdata on Ubuntu 20.04
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Python won't try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
+ENV PATH="/opt/aws/neuron/bin/:$PATH"
+# ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+ENV DGLBACKEND=pytorch
+
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    jq \
+    libopencv-dev \
+    software-properties-common \
+    wget \
+    unzip \
+    vim \
+    zlib1g-dev \
+    openssl \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libncurses-dev \
+    tk-dev \
+    libffi-dev \
+    libcap-dev \
+    gnupg2 \
+    gpg-agent \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+RUN apt-get update \
+ && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install Open MPI
+RUN mkdir -p /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+ && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+ && cd openmpi-${OMPI_VERSION} \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+ && tar -xzf Python-$PYTHON_VERSION.tgz \
+ && cd Python-$PYTHON_VERSION \
+ && ./configure --enable-shared --prefix=/usr/local \
+ && make -j $(nproc) && make install \
+ && cd .. && rm -rf Python-$PYTHON_VERSION* \
+ && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+ && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+ && ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+WORKDIR /
+
+# The ENV variables declared below are changed in the previous section
+# Grouping these ENV variables in the first section causes
+# ompi_info to fail. This is only observed in CPU containers
+ENV PATH="$PATH:/home/.openmpi/bin"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
+RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
+
+RUN ${PIP} install --no-cache-dir -U \
+    "bokeh>=2.3,<3" \
+    "awscli<2" \
+    scipy \
+    click \
+    "cryptography" \
+    psutil==5.6.7 \
+    dataset \
+    transformers==4.36.2 \
+    Pillow
+
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
+
+# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
+# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
+# awscli 1.25.47 has requirement docutils<0.17,>=0.10
+# etcd for kubernetes installation
+# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
+# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
+RUN ${PIP} install --no-cache-dir -U \
+    "attrs<24,>=23.1.0" \
+    "protobuf>=3.18.3,<=3.20.3" \
+    "docutils>=0.10,<0.17" \
+    "rsa<4.8,>=3.1.2" \
+    "urllib3>=1.26.0,<1.27"
+
+# The EFA installer runs apt-get itself, so refresh the package lists first
+RUN apt-get update
+RUN cd $HOME \
+ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+ && cat aws-efa-installer.key | gpg --fingerprint \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+ && tar -xf aws-efa-installer-latest.tar.gz \
+ && cd aws-efa-installer \
+ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
+ && cd $HOME
+
+
+# Clean up after apt update
+RUN rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install some common packages used by training scripts
+# torchvision is needed for the MLP test. It depends on torch, and torch (via
+# torch-neuronx) is already installed, so install it with --no-deps
+RUN pip3 install --no-cache-dir --no-deps -U \
+    torchvision==0.16.*
+
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.1/license.txt
+
+COPY e2e2/test/images/neuron/tests ./tests
\ No newline at end of file
diff --git a/e2e2/test/images/neuron/tests/singleNodeTest.sh b/e2e2/test/images/neuron/tests/singleNodeTest.sh
new file mode 100755
index 000000000..9828efa74
--- /dev/null
+++ b/e2e2/test/images/neuron/tests/singleNodeTest.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+# Runs the single-node Neuron tests one after another with torchrun.
+# Test paths are relative, so this must be started from the directory that
+# contains tests/ (the image copies them to ./tests under WORKDIR /).
+# Stop at the first failing test; otherwise only the last torchrun's exit
+# status would be reported and earlier failures would be masked.
+set -euo pipefail
+
+torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronSingleAllReduce.py
+torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronParallelState.py
+torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronMlp.py
\ No newline at end of file
diff --git a/e2e2/test/images/pytorch_tests/testNeuronMlp.py b/e2e2/test/images/neuron/tests/testNeuronMlp.py
similarity index 100%
rename from e2e2/test/images/pytorch_tests/testNeuronMlp.py
rename to e2e2/test/images/neuron/tests/testNeuronMlp.py
diff --git a/e2e2/test/images/pytorch_tests/testNeuronParallelState.py b/e2e2/test/images/neuron/tests/testNeuronParallelState.py
similarity index 100%
rename from e2e2/test/images/pytorch_tests/testNeuronParallelState.py
rename to e2e2/test/images/neuron/tests/testNeuronParallelState.py
diff --git a/e2e2/test/images/pytorch_tests/testNeuronSingleAllReduce.py b/e2e2/test/images/neuron/tests/testNeuronSingleAllReduce.py
similarity index 100%
rename from e2e2/test/images/pytorch_tests/testNeuronSingleAllReduce.py
rename to e2e2/test/images/neuron/tests/testNeuronSingleAllReduce.py
diff --git a/e2e2/test/images/pytorch_tests/singleNodeTest.sh b/e2e2/test/images/pytorch_tests/singleNodeTest.sh
deleted file mode 100755
index ba4e77da1..000000000
--- a/e2e2/test/images/pytorch_tests/singleNodeTest.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronSingleAllReduce.py
-torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronParallelState.py
-torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronMlp.py
\ No newline at end of file