diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile index f2fc0db1..64d12c61 100644 --- a/pytorch/Dockerfile +++ b/pytorch/Dockerfile @@ -85,9 +85,10 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin ENV SIGOPT_PROJECT=. WORKDIR / -COPY multinode-requirements.txt . +COPY multinode/requirements.txt requirements.txt -RUN python -m pip install --no-cache-dir -r multinode-requirements.txt +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib" @@ -99,16 +100,11 @@ RUN apt-get install -y --no-install-recommends --fix-missing \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Allow OpenSSH to talk to containers without asking for confirmation -# hadolint global ignore=SC2002 -RUN mkdir -p /var/run/sshd && \ - cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ - echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ - mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config +RUN mkdir -p /var/run/sshd ARG PYTHON_VERSION -COPY generate_ssh_keys.sh . 
+COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh # modify generate_ssh_keys to be a helper script # print how to use helper script on bash startup @@ -117,26 +113,9 @@ RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bin cat '/generate_ssh_keys.sh' >> ~/.startup && \ rm -rf /generate_ssh_keys.sh -# hadolint global ignore=SC3037 -RUN echo -e "#!/bin/bash \n\ -set -e \n\ -set -a \n\ -source ~/.startup \n\ -set +a \n\ -eval \"\$@\"" >> /usr/local/bin/dockerd-entrypoint.sh && \ - chmod +x /usr/local/bin/dockerd-entrypoint.sh - -RUN echo 'HostKey /etc/ssh/ssh_host_dsa_key' > /var/run/sshd_config && \ - echo 'HostKey /etc/ssh/ssh_host_rsa_key' > /var/run/sshd_config && \ - echo 'HostKey /etc/ssh/ssh_host_ecdsa_key' > /var/run/sshd_config && \ - echo 'HostKey /etc/ssh/ssh_host_ed25519_key' > /var/run/sshd_config && \ - echo 'AuthorizedKeysFile /etc/ssh/authorized_keys' > /var/run/sshd_config && \ - echo '## Enable DEBUG log. You can ignore this but this may help you debug any issue while enabling SSHD for the first time' > /var/run/sshd_config && \ - echo 'LogLevel DEBUG3' > /var/run/sshd_config && \ - echo 'UsePAM yes' > /var/run/sshd_config && \ - echo 'LoginGraceTime 0' >> /var/run/sshd_config && \ - echo 'LoginGraceTime 0' >> /etc/ssh/sshd_config && \ - echo 'Subsystem sftp /usr/lib/openssh/sftp-server' > /var/run/sshd_config +COPY multinode/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh +COPY multinode/sshd_config /etc/ssh/sshd_config +COPY multinode/ssh_config /etc/ssh/ssh_config RUN mkdir -p /licensing diff --git a/pytorch/README.md b/pytorch/README.md index 198c8c05..b1778cf5 100644 --- a/pytorch/README.md +++ b/pytorch/README.md @@ -114,12 +114,8 @@ The images below additionally include [IntelĀ® oneAPI Collective Communications | `2.0.0-pip-multinode` | [v2.0.0] | [v2.0.0+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.1.1] | [v0.1.0] | > **Note:** Passwordless SSH connection is also enabled in the image. 
-> The container does not contain the SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/root/.ssh/id_rsa.pub`. -> User also need to append content of id_rsa.pub in `/etc/ssh/authorized_keys` in the SSH server container. -> Since the SSH key is not owned by default user account in docker, please also do "chmod 644 id_rsa.pub; chmod 644 id_rsa" to grant read access for default user account. -> Users could also use "/usr/bin/ssh-keygen -t rsa -b 4096 -N '' -f ~/mnt/ssh_key/id_rsa" to generate a new SSH Key inside the container. -> Users need to mount a config file to list all hostnames at location `/root/.ssh/config` on the SSH client container. -> Once all files are added +> The container does not contain the SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`. +> Since the SSH key is not owned by default user account in docker, please also do "chmod 600 authorized_keys; chmod 600 id_rsa" to grant read access for default user account. #### Setup and Run IPEX Multi-Node Container @@ -131,8 +127,7 @@ SSH Server (Worker) SSH Client (Launcher) -1. *Config File with Host IPs* : `/root/.ssh/config` -2. *Private User Key* : `/root/.ssh/id_rsa` +1. *Private User Key* : `/root/.ssh/id_rsa` To add these files correctly please follow the steps described below. @@ -146,47 +141,33 @@ To add these files correctly please follow the steps described below. cat id_rsa.pub >> authorized_keys ``` -2. Add hosts to config - - The launcher container needs to have the a config file with all hostnames and ports specified. An example of a hostfile is provided below. +2. Configure the permissions and ownership for all of the files you have created so far. ```bash - touch config + chmod 600 id_rsa config authorized_keys + chown root:root id_rsa.pub id_rsa config authorized_keys ``` +3. Setup hostfile. The hostfile is needed for running torch distributed using `ipexrun` utility. 
If you're not using `ipexrun` you can skip this step. + ```txt - Host host1 - HostName - IdentitiesOnly yes - Port - Host host2 - HostName - IdentitiesOnly yes - Port + + ... ``` -3. Configure the permissions and ownership for all of the files you have created so far. - - ```bash - chmod 600 id_rsa.pub id_rsa config authorized_keys - chown root:root id_rsa.pub id_rsa config authorized_keys - ``` - 4. Now start the workers and execute DDP on the launcher. 1. Worker run command: ```bash - export SSH_PORT= docker run -it --rm \ --net=host \ - -v $PWD/authorized_keys:/root/.ssh/authorized_keys \ + -v $PWD/authorized_keys:/etc/ssh/authorized_keys \ -v $PWD/tests:/workspace/tests \ -w /workspace \ - -e SSH_PORT=${SSH_PORT} \ intel/intel-extension-for-pytorch:2.3.0-pip-multinode \ - bash -c '/usr/sbin/sshd -D -p ${SSH_PORT} -f /var/run/sshd_config' + bash -c '/usr/sbin/sshd -D' ``` 2. Launcher run command: @@ -195,12 +176,65 @@ To add these files correctly please follow the steps described below. docker run -it --rm \ --net=host \ -v $PWD/id_rsa:/root/.ssh/id_rsa \ - -v $PWD/config:/root/.ssh/config \ -v $PWD/tests:/workspace/tests \ + -v $PWD/hostfile:/workspace/hostfile \ -w /workspace \ + intel/intel-extension-for-pytorch:2.3.0-pip-multinode \ + bash -c 'ipexrun cpu --nnodes 2 --nprocs-per-node 1 --master-addr 127.0.0.1 --master-port 3022 /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl' + ``` + +5. Start SSH server with a custom port. + If the user wants to define their own port to start the SSH server, it can be done so using the commands described below. + + 1. Worker command: + + ```bash + export SSH_PORT= + docker run -it --rm \ + --net=host \ + -v $PWD/authorized_keys:/etc/ssh/authorized_keys \ + -v $PWD/tests:/workspace/tests \ -e SSH_PORT=${SSH_PORT} \ + -w /workspace \ + intel/intel-extension-for-pytorch:2.3.0-pip-multinode \ + bash -c '/usr/sbin/sshd -D -p ${SSH_PORT}' + ``` + + 2. Add hosts to config. 
(**Note:** This is an optional step) + + Users can optionally mount their own custom client config file to define a list of hosts and ports where the SSH server is running inside the container. An example of such a config file is provided below. This file is supposed to be mounted in the launcher container at `/etc/ssh/ssh_config`. + + ```bash + touch config + ``` + + ```txt + Host host1 + HostName + IdentitiesOnly yes + IdentityFile ~/.ssh/id_rsa + Port + Host host2 + HostName + IdentitiesOnly yes + IdentityFile ~/.ssh/id_rsa + Port + ... + ``` + + 3. Launcher run command: + + ```bash + docker run -it --rm \ + --net=host \ + -v $PWD/id_rsa:/root/.ssh/id_rsa \ + -v $PWD/config:/etc/ssh/ssh_config \ + -v $PWD/hostfile:/workspace/hostfile \ + -v $PWD/tests:/workspace/tests \ + -e SSH_PORT=${SSH_PORT} \ + -w /workspace \ intel/intel-extension-for-pytorch:2.3.0-pip-multinode \ - bash -c 'ipexrun cpu /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl' + bash -c 'ipexrun cpu --nnodes 2 --nprocs-per-node 1 --master-addr 127.0.0.1 --master-port ${SSH_PORT} /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl' ``` > [!NOTE] diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml index f18afdf8..b42f6d84 100644 --- a/pytorch/docker-compose.yaml +++ b/pytorch/docker-compose.yaml @@ -77,7 +77,7 @@ services: dependency.apt.libglib2: true dependency.apt.python3-dev: true dependency.pip.apt.virtualenv: true - dependency.python.pip: multinode-requirements.txt + dependency.python.pip: multinode/requirements.txt org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "IntelĀ® Extension for PyTorch MultiNode Image" org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode diff --git a/pytorch/multinode/dockerd-entrypoint.sh b/pytorch/multinode/dockerd-entrypoint.sh new file mode 100755 index 00000000..ba13c0f9 --- /dev/null 
+++ b/pytorch/multinode/dockerd-entrypoint.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Container entrypoint: source ~/.startup with auto-export enabled, then run the command passed as arguments. +set -e +set -a +# shellcheck disable=SC1091 +source "$HOME/.startup" +set +a +# exec replaces this shell with the command so that it receives signals (e.g. SIGTERM from docker stop) directly. +exec "$@" diff --git a/pytorch/generate_ssh_keys.sh b/pytorch/multinode/generate_ssh_keys.sh similarity index 100% rename from pytorch/generate_ssh_keys.sh rename to pytorch/multinode/generate_ssh_keys.sh diff --git a/pytorch/multinode-requirements.txt b/pytorch/multinode/requirements.txt similarity index 100% rename from pytorch/multinode-requirements.txt rename to pytorch/multinode/requirements.txt diff --git a/pytorch/multinode/ssh_config b/pytorch/multinode/ssh_config new file mode 100644 index 00000000..9ac73017 --- /dev/null +++ b/pytorch/multinode/ssh_config @@ -0,0 +1,4 @@ +Host * + Port 3022 + IdentityFile ~/.ssh/id_rsa + StrictHostKeyChecking no diff --git a/pytorch/multinode/sshd_config b/pytorch/multinode/sshd_config new file mode 100644 index 00000000..4796a48a --- /dev/null +++ b/pytorch/multinode/sshd_config @@ -0,0 +1,12 @@ +HostKey /etc/ssh/ssh_host_dsa_key +HostKey /etc/ssh/ssh_host_rsa_key +HostKey /etc/ssh/ssh_host_ecdsa_key +HostKey /etc/ssh/ssh_host_ed25519_key +AuthorizedKeysFile /etc/ssh/authorized_keys +## Enable DEBUG log. 
You can ignore this but this may help you debug any issue while enabling SSHD for the first time +LogLevel DEBUG3 +Port 3022 +UsePAM yes +Subsystem sftp /usr/lib/openssh/sftp-server +# https://ubuntu.com/security/CVE-2024-6387 +LoginGraceTime 0