From 82dd5293f7b530769cb1dac660a38144b75eef0b Mon Sep 17 00:00:00 2001 From: Weicong Wang Date: Tue, 2 Jul 2024 02:24:27 +0000 Subject: [PATCH] Fork the neuron DLC dockerfile --- .github/workflows/ci.yaml | 2 +- .../manifests/single-node-test-neuronx.yaml | 2 +- e2e2/test/cases/neuron/neuron_test.go | 3 +- e2e2/test/images/neuron/Dockerfile | 193 +++++++++ .../images/neuron/common/changehostname.c | 33 ++ .../neuron/common/deep_learning_container.py | 365 ++++++++++++++++++ .../common/start_with_right_hostname.sh | 24 ++ .../images/neuron/tests/singleNodeTest.sh | 5 + .../tests}/testNeuronMlp.py | 0 .../tests}/testNeuronParallelState.py | 0 .../tests}/testNeuronSingleAllReduce.py | 0 .../images/pytorch_tests/singleNodeTest.sh | 5 - 12 files changed, 624 insertions(+), 8 deletions(-) create mode 100644 e2e2/test/images/neuron/Dockerfile create mode 100644 e2e2/test/images/neuron/common/changehostname.c create mode 100644 e2e2/test/images/neuron/common/deep_learning_container.py create mode 100644 e2e2/test/images/neuron/common/start_with_right_hostname.sh create mode 100755 e2e2/test/images/neuron/tests/singleNodeTest.sh rename e2e2/test/images/{pytorch_tests => neuron/tests}/testNeuronMlp.py (100%) rename e2e2/test/images/{pytorch_tests => neuron/tests}/testNeuronParallelState.py (100%) rename e2e2/test/images/{pytorch_tests => neuron/tests}/testNeuronSingleAllReduce.py (100%) delete mode 100755 e2e2/test/images/pytorch_tests/singleNodeTest.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f206ab859..b9b3dbc56 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,4 +25,4 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - run: docker build --file e2e2/test/images/Dockerfile.neuronx-tests . \ No newline at end of file + - run: docker build --file e2e2/test/images/neuron/Dockerfile . 
\ No newline at end of file
diff --git a/e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml b/e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml
index 4ed0c65b9..bf1d670e3 100644
--- a/e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml
+++ b/e2e2/test/cases/neuron/manifests/single-node-test-neuronx.yaml
@@ -15,7 +15,7 @@ spec:
         image: {{.NeuronTestImage}}
         command:
         - /bin/bash
-        - ./pytorch_tests/singleNodeTest.sh
+        - ./tests/singleNodeTest.sh
         imagePullPolicy: Always
         resources:
           limits:
diff --git a/e2e2/test/cases/neuron/neuron_test.go b/e2e2/test/cases/neuron/neuron_test.go
index 26bf61a58..7a86b15b5 100644
--- a/e2e2/test/cases/neuron/neuron_test.go
+++ b/e2e2/test/cases/neuron/neuron_test.go
@@ -34,7 +34,8 @@ func TestMPIJobPytorchTraining(t *testing.T) {
 	if *neuronTestImage == "" {
-		t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.neuronx-tests to build the image and -neuronTestImage to set the image url"))
+		t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/neuron/Dockerfile to build the image and -neuronTestImage to set the image url"))
 	}
-	renderedNeuronSingleNodeManifest, err := fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{
+	var err error
+	renderedNeuronSingleNodeManifest, err = fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{
 		NeuronTestImage: *neuronTestImage,
 	})
 	if err != nil {
diff --git a/e2e2/test/images/neuron/Dockerfile b/e2e2/test/images/neuron/Dockerfile
new file mode 100644
index 000000000..8a3f54ff6
--- /dev/null
+++ b/e2e2/test/images/neuron/Dockerfile
@@ -0,0 +1,193 @@
+FROM public.ecr.aws/docker/library/ubuntu:20.04
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+# Neuron SDK components version numbers
+ARG NEURONX_DISTRIBUTED_VERSION=0.7.0
+ARG NEURONX_CC_VERSION=2.13.72.0
+ARG NEURONX_FRAMEWORK_VERSION=2.1.2.2.1.0
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.20.22.0-c101c322e
+ARG NEURONX_RUNTIME_LIB_VERSION=2.20.22.0-1b3ca6425
+ARG NEURONX_TOOLS_VERSION=2.17.1.0
+
+ARG PYTHON=python3.10
+ARG PYTHON_VERSION=3.10.12
+ARG PIP=pip3
+ARG OMPI_VERSION=4.1.5
+
+# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
+ENV PATH="/opt/aws/neuron/bin/:$PATH"
+# ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+ENV DGLBACKEND=pytorch
+
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    jq \
+    libopencv-dev \
+    software-properties-common \
+    wget \
+    unzip \
+    vim \
+    zlib1g-dev \
+    openssl \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libncurses-dev \
+    tk-dev \
+    libffi-dev \
+    libcap-dev \
+    gnupg2 \
+    gpg-agent \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+RUN apt-get update \
+ && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install Open MPI
+RUN mkdir -p /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+ && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+ && cd openmpi-${OMPI_VERSION} \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+ && tar -xzf Python-$PYTHON_VERSION.tgz \
+ && cd Python-$PYTHON_VERSION \
+ && ./configure --enable-shared --prefix=/usr/local \
+ && make -j $(nproc) && make install \
+ && cd .. && rm -rf ../Python-$PYTHON_VERSION* \
+ && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+ && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+ && ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+WORKDIR /
+
+# The ENV variables declared below are changed in the previous section
+# Grouping these ENV variables in the first section causes
+# ompi_info to fail. This is only observed in CPU containers
+ENV PATH="$PATH:/home/.openmpi/bin"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
+RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
+
+# Copy workaround script for incorrect hostname
+COPY e2e2/test/images/neuron/common/changehostname.c /
+COPY e2e2/test/images/neuron/common/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
+COPY e2e2/test/images/neuron/common/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+COPY e2e2/test/images/neuron/tests ./tests
+
+RUN ${PIP} install --no-cache-dir -U \
+    "bokeh>=2.3,<3" \
+    "awscli<2" \
+    scipy \
+    click \
+    "cryptography" \
+    psutil==5.6.7 \
+    dataset \
+    transformers==4.36.2 \
+    Pillow
+
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
+
+# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
+# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
+# awscli 1.25.47 has requirement docutils<0.17,>=0.10
+# etcd for kubernetes installation
+# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
+# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
+RUN ${PIP} install --no-cache-dir -U \
+    "attrs<24,>=23.1.0" \
+    "protobuf>=3.18.3,<=3.20.3" \
+    "docutils>=0.10,<0.17" \
+    "rsa<4.8,>=3.1.2" \
+    "python-etcd" \
+    "urllib3>=1.26.0,<1.27"
+
+# EFA Installer does apt get. Make sure to run apt update before that
+RUN apt-get update \
+ && cd $HOME \
+ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+ && cat aws-efa-installer.key | gpg --fingerprint \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+ && tar -xf aws-efa-installer-latest.tar.gz \
+ && cd aws-efa-installer \
+ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
+ && cd $HOME
+
+
+# Clean up after apt update
+RUN rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install some common packages used by training scripts
+# torchvision needed for MLP. since it depends on torch and torch neuron/torch
+# is already installed install it with nodeps
+RUN pip3 install --no-cache-dir --no-deps -U \
+    torchvision==0.16.*
+
+RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
+ && chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.1/license.txt
+
+# Starts framework
+ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
+CMD ["/bin/bash"]
diff --git a/e2e2/test/images/neuron/common/changehostname.c b/e2e2/test/images/neuron/common/changehostname.c
new file mode 100644
index 000000000..0db22099d
--- /dev/null
+++ b/e2e2/test/images/neuron/common/changehostname.c
@@ -0,0 +1,33 @@
+#include <stdio.h>
+#include <string.h>
+
+/**
+ * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You
+ * may not use this file except in compliance with the License. A copy of
+ * the License is located at
+ *
+ *     http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+ * ANY KIND, either express or implied. See the License for the specific
+ * language governing permissions and limitations under the License.
+ */
+
+/**
+ * Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker.
+ *
+ * Without this gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host,
+ * not realizing that it needs to use NET/Socket.
+ *
+ * When docker container starts we read 'current_host' value from /opt/ml/input/config/resourceconfig.json
+ * and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library.
+ */
+int gethostname(char *name, size_t len)
+{
+  const char *val = PLACEHOLDER_HOSTNAME;
+  strncpy(name, val, len);
+  return 0;
+}
diff --git a/e2e2/test/images/neuron/common/deep_learning_container.py b/e2e2/test/images/neuron/common/deep_learning_container.py
new file mode 100644
index 000000000..207df7d6b
--- /dev/null
+++ b/e2e2/test/images/neuron/common/deep_learning_container.py
@@ -0,0 +1,365 @@
+# Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import argparse
+import json
+import logging
+import multiprocessing
+import os
+import re
+import signal
+import sys
+
+import botocore.session
+import requests
+
+TIMEOUT_SECS = 5
+
+
+def requests_helper(url, headers=None, timeout=0.1):
+    """
+    Send a single GET request; returns the response, or None if the request raised
+    :param url: str, url to get the request
+    :param headers: dict, optional HTTP headers to send with the request
+    :param timeout: float, timeout value for a request
+    """
+    response = None
+    try:
+        if headers:
+            response = requests.get(url, headers=headers, timeout=timeout)
+        else:
+            response = requests.get(url, timeout=timeout)
+
+    except requests.exceptions.RequestException as e:
+        logging.error("Request exception: {}".format(e))
+
+    return response
+
+
+def requests_helper_imds(url, token=None):
+    """
+    Requests to get instance metadata using imdsv1 and imdsv2
+    :param url: str, url to get the request
+    :param token: str, token is needed to use imdsv2
+    """
+    response_text = None
+    response = None
+    headers = None
+    if token:
+        headers = {"X-aws-ec2-metadata-token": token}
+    timeout = 1
+    try:
+        while timeout <= 3:
+            if headers:
+                response = requests.get(url, headers=headers, timeout=timeout)
+            else:
+                response = requests.get(url, timeout=timeout)
+            if response:
+                break
+            timeout += 1
+
+    except requests.exceptions.RequestException as e:
+        logging.error("Request exception: {}".format(e))
+
+    if response is not None and not (400 <= response.status_code < 600):
+        response_text = response.text
+
+    return response_text
+
+
+def get_imdsv2_token():
+    """
+    Retrieve token using imdsv2 service
+    """
+    response = None
+    token = None
+    headers = {"X-aws-ec2-metadata-token-ttl-seconds": "600"}
+    url = "http://169.254.169.254/latest/api/token"
+    timeout = 1
+
+    try:
+        while timeout <= 3:
+            response = requests.put(url, headers=headers, timeout=timeout)
+            if response:
+                break
+            timeout += 1
+    except requests.exceptions.RequestException as e:
+        logging.error("Request exception: {}".format(e))
+
+    if response is not None and not (400 <= response.status_code < 600):
+        token = response.text
+
+    return token
+
+
+def _validate_instance_id(instance_id):
+    """
+    Validate instance ID
+    """
+    instance_id_regex = r"^(i-\S{17})"
+    compiled_regex = re.compile(instance_id_regex)
+    match = compiled_regex.match(instance_id)
+
+    if not match:
+        return None
+
+    return match.group(1)
+
+
+def _retrieve_instance_id(token=None):
+    """
+    Retrieve instance ID from instance metadata service
+    """
+    instance_id = None
+    instance_url = "http://169.254.169.254/latest/meta-data/instance-id"
+
+    if token:
+        instance_id = requests_helper_imds(instance_url, token)
+    else:
+        instance_id = requests_helper_imds(instance_url)
+
+    if instance_id:
+        instance_id = _validate_instance_id(instance_id)
+
+    return instance_id
+
+
+def _retrieve_instance_region(token=None):
+    """
+    Retrieve instance region from instance metadata service
+    """
+    region = None
+    response_json = None
+    valid_regions = [
+        "ap-northeast-1",
+        "ap-northeast-2",
+        "ap-southeast-1",
+        "ap-southeast-2",
+        "ap-south-1",
+        "ca-central-1",
+        "eu-central-1",
+        "eu-north-1",
+        "eu-west-1",
+        "eu-west-2",
+        "eu-west-3",
+        "sa-east-1",
+        "us-east-1",
+        "us-east-2",
+        "us-west-1",
+        "us-west-2",
+    ]
+
+    region_url = "http://169.254.169.254/latest/dynamic/instance-identity/document"
+
+    if token:
+        response_text = requests_helper_imds(region_url, token)
+    else:
+        response_text = requests_helper_imds(region_url)
+
+    if response_text:
+        response_json = json.loads(response_text)
+
+        if response_json["region"] in valid_regions:
+            region = response_json["region"]
+
+    return region
+
+
+def _retrieve_device():
+    return (
+        "gpu"
+        if os.path.isdir("/usr/local/cuda")
+        else "eia"
+        if os.path.isdir("/opt/ei_tools")
+        else "neuron"
+        if os.path.exists("/usr/local/bin/tensorflow_model_server_neuron")
+        else "cpu"
+    )
+
+
+def _retrieve_cuda():
+    cuda_version = ""
+    try:
+        cuda_path = os.path.basename(os.readlink("/usr/local/cuda"))
+        cuda_version_search = re.search(r"\d+\.\d+", cuda_path)
+        cuda_version = "" if not cuda_version_search else cuda_version_search.group()
+    except Exception as e:
+        logging.error(f"Failed to get cuda path: {e}")
+    return cuda_version
+
+
+def _retrieve_os():
+    version = ""
+    name = ""
+    with open("/etc/os-release", "r") as f:
+        for line in f.readlines():
+            if re.match(r"^ID=\w+$", line):
+                name = re.search(r"^ID=(\w+)$", line).group(1)
+            if re.match(r'^VERSION_ID="\d+\.\d+"$', line):
+                version = re.search(r'^VERSION_ID="(\d+\.\d+)"$', line).group(1)
+    return name + version
+
+
+def parse_args():
+    """
+    Parsing function to parse input arguments.
+    Return: args, which contains the parsed input arguments.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--framework",
+        choices=["tensorflow", "mxnet", "pytorch"],
+        help="framework of container image.",
+        required=True,
+    )
+    parser.add_argument(
+        "--framework-version", help="framework version of container image.", required=True
+    )
+    parser.add_argument(
+        "--container-type",
+        choices=["training", "inference"],
+        help="What kind of jobs you want to run on container. Either training or inference.",
+        required=True,
+    )
+
+    args, _unknown = parser.parse_known_args()
+
+    fw_version_pattern = r"\d+(\.\d+){1,2}(-rc\d)?"
+
+    # PT 1.10 and above has +cpu or +cu113 string, so handle accordingly
+    if args.framework == "pytorch":
+        pt_fw_version_pattern = r"(\d+(\.\d+){1,2}(-rc\d)?)((\+cpu)|(\+cu\d{3})|(a0\+git\w{7}))"
+        pt_fw_version_match = re.fullmatch(pt_fw_version_pattern, args.framework_version)
+        if pt_fw_version_match:
+            args.framework_version = pt_fw_version_match.group(1)
+    assert re.fullmatch(fw_version_pattern, args.framework_version), (
+        f"args.framework_version = {args.framework_version} does not match {fw_version_pattern}\n"
+        f"Please specify framework version as X.Y.Z or X.Y."
+    )
+    # TFS 2.12.1 still uses TF 2.12.0 and breaks the telemetry check as it is checking TF version
+    # instead of TFS version. We are forcing the version we want.
+    if (
+        args.framework == "tensorflow"
+        and args.container_type == "inference"
+        and args.framework_version == "2.12.0"
+    ):
+        args.framework_version = "2.12.1"
+
+    return args
+
+
+def query_bucket(instance_id, region):
+    """
+    GET request on an empty object from an Amazon S3 bucket
+    """
+    response = None
+    args = parse_args()
+    framework, framework_version, container_type = (
+        args.framework,
+        args.framework_version,
+        args.container_type,
+    )
+    py_version = sys.version.split(" ")[0]
+
+    if instance_id is not None and region is not None:
+        url = (
+            "https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com"
+            "/dlc-containers-{1}.txt?x-instance-id={1}&x-framework={2}&x-framework_version={3}&x-py_version={4}&x-container_type={5}".format(
+                region, instance_id, framework, framework_version, py_version, container_type
+            )
+        )
+        response = requests_helper(url, timeout=0.2)
+        if os.environ.get("TEST_MODE") == str(1):
+            with open(os.path.join(os.sep, "tmp", "test_request.txt"), "w+") as rf:
+                rf.write(url)
+
+    logging.debug("Query bucket finished: {}".format(response))
+
+    return response
+
+
+def tag_instance(instance_id, region):
+    """
+    Apply instance tag on the instance that is running the container using botocore
+    """
+    args = parse_args()
+    framework, framework_version, container_type = (
+        args.framework,
+        args.framework_version,
+        args.container_type,
+    )
+    py_version = sys.version.split(" ")[0]
+    device = _retrieve_device()
+    cuda_version = f"_cuda{_retrieve_cuda()}" if device == "gpu" else ""
+    os_version = _retrieve_os()
+
+    tag = f"{framework}_{container_type}_{framework_version}_python{py_version}_{device}{cuda_version}_{os_version}"
+    tag_struct = {"Key": "aws-dlc-autogenerated-tag-do-not-delete", "Value": tag}
+
+    request_status = None
+    if instance_id and region:
+        try:
+            session = botocore.session.get_session()
+            ec2_client = session.create_client("ec2", region_name=region)
+            response = ec2_client.create_tags(Resources=[instance_id], Tags=[tag_struct])
+            request_status = response.get("ResponseMetadata").get("HTTPStatusCode")
+            if os.environ.get("TEST_MODE") == str(1):
+                with open(os.path.join(os.sep, "tmp", "test_tag_request.txt"), "w+") as rf:
+                    rf.write(json.dumps(tag_struct, indent=4))
+        except Exception as e:
+            logging.error(f"Error. {e}")
+        logging.debug("Instance tagged successfully: {}".format(request_status))
+    else:
+        logging.error("Failed to retrieve instance_id or region")
+
+    return request_status
+
+
+def main():
+    """
+    Invoke bucket query
+    """
+    # Logs are not necessary for normal run. Remove this line while debugging.
+    logging.getLogger().disabled = True
+
+    logging.basicConfig(level=logging.ERROR)
+    token = None
+    instance_id = None
+    region = None
+    token = get_imdsv2_token()
+    if token:
+        instance_id = _retrieve_instance_id(token)
+        region = _retrieve_instance_region(token)
+    else:
+        instance_id = _retrieve_instance_id()
+        region = _retrieve_instance_region()
+
+    bucket_process = multiprocessing.Process(target=query_bucket, args=(instance_id, region))
+    tag_process = multiprocessing.Process(target=tag_instance, args=(instance_id, region))
+
+    bucket_process.start()
+    tag_process.start()
+
+    tag_process.join(TIMEOUT_SECS)
+    bucket_process.join(TIMEOUT_SECS)
+
+    if tag_process.is_alive():
+        os.kill(tag_process.pid, signal.SIGKILL)
+        tag_process.join()
+    if bucket_process.is_alive():
+        os.kill(bucket_process.pid, signal.SIGKILL)
+        bucket_process.join()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/e2e2/test/images/neuron/common/start_with_right_hostname.sh b/e2e2/test/images/neuron/common/start_with_right_hostname.sh
new file mode 100644
index 000000000..63c4fee89
--- /dev/null
+++ b/e2e2/test/images/neuron/common/start_with_right_hostname.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+if [[ "$1" = "train" ]]; then
+    CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json)
+    sed -ie "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" changehostname.c
+    gcc -o changehostname.o -c -fPIC -Wall changehostname.c
+    gcc -o libchangehostname.so -shared -export-dynamic changehostname.o -ldl
+    LD_PRELOAD=/libchangehostname.so train
+else
+    eval "$@"
+fi
diff --git a/e2e2/test/images/neuron/tests/singleNodeTest.sh b/e2e2/test/images/neuron/tests/singleNodeTest.sh
new file mode 100755
index 000000000..9828efa74
--- /dev/null
+++ b/e2e2/test/images/neuron/tests/singleNodeTest.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -e  # stop at the first failing torchrun so the script's exit status reflects every test
+torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronSingleAllReduce.py
+torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronParallelState.py
+torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronMlp.py
\ No newline at end of file
diff --git a/e2e2/test/images/pytorch_tests/testNeuronMlp.py b/e2e2/test/images/neuron/tests/testNeuronMlp.py
similarity index 100%
rename from e2e2/test/images/pytorch_tests/testNeuronMlp.py
rename to e2e2/test/images/neuron/tests/testNeuronMlp.py
diff --git a/e2e2/test/images/pytorch_tests/testNeuronParallelState.py b/e2e2/test/images/neuron/tests/testNeuronParallelState.py
similarity index 100%
rename from e2e2/test/images/pytorch_tests/testNeuronParallelState.py
rename to e2e2/test/images/neuron/tests/testNeuronParallelState.py
diff --git a/e2e2/test/images/pytorch_tests/testNeuronSingleAllReduce.py b/e2e2/test/images/neuron/tests/testNeuronSingleAllReduce.py
similarity index 100%
rename from e2e2/test/images/pytorch_tests/testNeuronSingleAllReduce.py
rename to e2e2/test/images/neuron/tests/testNeuronSingleAllReduce.py
diff --git a/e2e2/test/images/pytorch_tests/singleNodeTest.sh b/e2e2/test/images/pytorch_tests/singleNodeTest.sh
deleted file mode 100755
index ba4e77da1..000000000
--- a/e2e2/test/images/pytorch_tests/singleNodeTest.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronSingleAllReduce.py
-torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronParallelState.py
-torchrun --nproc_per_node=2 --nnodes=1 pytorch_tests/testNeuronMlp.py
\ No newline at end of file