From 72ff98b52b5b2d407c96f988fd65ee50d23f456f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 13 Jun 2024 16:12:38 -0700 Subject: [PATCH 1/6] optimize GPU image --- Dockerfile_k8s_gpu | 59 ++++++++++++++-------------- sky/provision/kubernetes/instance.py | 7 +++- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/Dockerfile_k8s_gpu b/Dockerfile_k8s_gpu index f570181d8e7..7a8edf47151 100644 --- a/Dockerfile_k8s_gpu +++ b/Dockerfile_k8s_gpu @@ -1,46 +1,51 @@ -# TODO(romilb) - The base image used here (ray) is very large (11.4GB). -# as a result, this built image is about 13.5GB. We need to pick a lighter base -# image. -FROM rayproject/ray:2.9.3-py310-gpu +FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 -# Initialize conda for root user, install ssh and other local dependencies +# Install ssh and other local dependencies # We remove cuda lists to avoid conflicts with the cuda version installed by ray -RUN sudo rm -rf /etc/apt/sources.list.d/cuda* && \ - sudo apt update -y && \ - sudo apt install gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \ - sudo rm -rf /var/lib/apt/lists/* && \ - sudo apt remove -y python3 && \ - conda init +RUN rm -rf /etc/apt/sources.list.d/cuda* && \ + apt update -y && \ + apt install gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \ + rm -rf /var/lib/apt/lists/* # Setup new user named sky and add to sudoers. \ -# Also add /opt/conda/bin to sudo path and give sky user access to /home/ray +# Also add /opt/conda/bin to sudo path and give sky user permission to run sudo without password RUN sudo useradd -m -s /bin/bash sky && \ sudo /bin/bash -c 'echo "sky ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers' && \ - sudo /bin/bash -c "echo 'Defaults secure_path=\"/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"' > /etc/sudoers.d/sky" && \ - sudo chmod -R a+rwx /home/ray + sudo /bin/bash -c "echo 'Defaults secure_path=\"/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"' > /etc/sudoers.d/sky" # Switch to sky user USER sky -# Set HOME environment variable for sky user, otherwise Ray base image HOME overrides +# Set HOME environment variable for sky user ENV HOME /home/sky +# Set current working directory +WORKDIR /home/sky + +SHELL ["/bin/bash", "-c"] + +# Install conda and other dependencies +# Keep the Ray version below in sync with skylet.constants.SKY_REMOTE_RAY_VERSION +RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && \ + bash Miniconda3-Linux-x86_64.sh -b && \ + eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true && conda activate base && \ + grep "# >>> conda initialize >>>" ~/.bashrc || { conda init && source ~/.bashrc; } && \ + rm Miniconda3-Linux-x86_64.sh && \ + pip install wheel Click colorama cryptography jinja2 jsonschema networkx \ + oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \ + 'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \ + grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \ + curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \ + sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + + # Setup SSH and generate hostkeys RUN sudo mkdir -p /var/run/sshd && \ sudo chmod 0755 /var/run/sshd && \ sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \ cd /etc/ssh/ && \ - ssh-keygen -A - -# Install SkyPilot pip dependencies -RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \ - pip install networkx oauth2client pandas pendulum PrettyTable && \ - pip install rich tabulate filelock && \ - pip install packaging 'protobuf<4.0.0' pulp && \ - pip install pycryptodome==3.12.0 && \ - pip install docker kubernetes==28.1.0 && \ - pip install grpcio==1.51.3 python-dotenv==1.0.1 + sudo ssh-keygen -A # Add /home/sky/.local/bin/ to PATH RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc @@ -51,7 +56,3 @@ COPY --chown=sky . /skypilot/sky/ # Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately ENV PYTHONUNBUFFERED=1 - -# Set WORKDIR and initialize conda for sky user -WORKDIR /home/sky -RUN conda init diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index a0727b26a5b..8c226e5fcda 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -329,8 +329,11 @@ def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None: 'prefix_cmd() ' '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; ' 'export DEBIAN_FRONTEND=noninteractive;' - '$(prefix_cmd) apt-get update;' - '$(prefix_cmd) apt install openssh-server rsync -y; ' + # Check and install openssh-server and rsync only if either is missing + 'if ! $(prefix_cmd) dpkg -l | grep -qw openssh-server || ! $(prefix_cmd) dpkg -l | grep -qw rsync; then ' # pylint: disable=line-too-long + ' $(prefix_cmd) apt-get update;' + ' $(prefix_cmd) apt install openssh-server rsync -y; ' + 'fi; ' '$(prefix_cmd) mkdir -p /var/run/sshd; ' '$(prefix_cmd) ' 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" ' From 3500165241295e5d205e1693bf1b9c08448743f6 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 13 Jun 2024 17:02:00 -0700 Subject: [PATCH 2/6] revert changes --- sky/provision/kubernetes/instance.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 8c226e5fcda..a0727b26a5b 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -329,11 +329,8 @@ def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None: 'prefix_cmd() ' '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; ' 'export DEBIAN_FRONTEND=noninteractive;' - # Check and install openssh-server and rsync only if either is missing - 'if ! $(prefix_cmd) dpkg -l | grep -qw openssh-server || ! $(prefix_cmd) dpkg -l | grep -qw rsync; then ' # pylint: disable=line-too-long - ' $(prefix_cmd) apt-get update;' - ' $(prefix_cmd) apt install openssh-server rsync -y; ' - 'fi; ' + '$(prefix_cmd) apt-get update;' + '$(prefix_cmd) apt install openssh-server rsync -y; ' '$(prefix_cmd) mkdir -p /var/run/sshd; ' '$(prefix_cmd) ' 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" ' From 78e23f16b81a2a3f8e12ef28b0326147eeab4a89 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 13 Jun 2024 17:04:18 -0700 Subject: [PATCH 3/6] add kubectl to cpu image --- Dockerfile_k8s | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile_k8s b/Dockerfile_k8s index 7b311dde13f..043f7fe8476 100644 --- a/Dockerfile_k8s +++ b/Dockerfile_k8s @@ -32,7 +32,9 @@ RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \ pip install packaging 'protobuf<4.0.0' pulp && \ pip install pycryptodome==3.12.0 && \ pip install docker kubernetes==28.1.0 && \ - pip install grpcio==1.51.3 python-dotenv==1.0.1 + pip install grpcio==1.51.3 python-dotenv==1.0.1 && \ + curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \ + sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl # Add /home/sky/.local/bin/ to PATH RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc From d6ef607419c0a7d6b90d30490ecfe2ce83079749 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 13 Jun 2024 23:18:53 -0700 Subject: [PATCH 4/6] add git to dependencies --- Dockerfile_k8s | 2 +- Dockerfile_k8s_gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile_k8s b/Dockerfile_k8s index 043f7fe8476..a5b9d34bb04 100644 --- a/Dockerfile_k8s +++ b/Dockerfile_k8s @@ -5,7 +5,7 @@ FROM continuumio/miniconda3:23.3.1-0 # Initialize conda for root user, install ssh and other local dependencies RUN apt update -y && \ - apt install gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \ + apt install git gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \ rm -rf /var/lib/apt/lists/* && \ apt remove -y python3 && \ conda init diff --git a/Dockerfile_k8s_gpu b/Dockerfile_k8s_gpu index 7a8edf47151..42bfe1cf618 100644 --- a/Dockerfile_k8s_gpu +++ b/Dockerfile_k8s_gpu @@ -4,7 +4,7 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 # We remove cuda lists to avoid conflicts with the cuda version installed by ray RUN rm -rf /etc/apt/sources.list.d/cuda* && \ apt update -y && \ - apt install gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \ + apt install git gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \ rm -rf /var/lib/apt/lists/* # Setup new user named sky and add to sudoers. \ From c0c55f4e5496fb14e367358d0991fa9892c429c0 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 14 Jun 2024 11:36:06 -0700 Subject: [PATCH 5/6] fixes and parity b/w cpu and gpu images --- Dockerfile_k8s | 24 +++++++++++++----------- Dockerfile_k8s_gpu | 20 ++++++++++---------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/Dockerfile_k8s b/Dockerfile_k8s index a5b9d34bb04..63def8682b2 100644 --- a/Dockerfile_k8s +++ b/Dockerfile_k8s @@ -3,6 +3,8 @@ FROM continuumio/miniconda3:23.3.1-0 # TODO(romilb): Investigate if this image can be consolidated with the skypilot # client image (`Dockerfile`) +ARG DEBIAN_FRONTEND=noninteractive + # Initialize conda for root user, install ssh and other local dependencies RUN apt update -y && \ apt install git gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \ @@ -25,14 +27,18 @@ RUN useradd -m -s /bin/bash sky && \ # Switch to sky user USER sky +# Set HOME environment variable for sky user +ENV HOME /home/sky + +# Set current working directory +WORKDIR /home/sky + # Install SkyPilot pip dependencies preemptively to speed up provisioning time -RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \ - pip install networkx oauth2client pandas pendulum PrettyTable && \ - pip install ray[default]==2.9.3 rich tabulate filelock && \ - pip install packaging 'protobuf<4.0.0' pulp && \ - pip install pycryptodome==3.12.0 && \ - pip install docker kubernetes==28.1.0 && \ - pip install grpcio==1.51.3 python-dotenv==1.0.1 && \ +RUN conda init && \ + pip install wheel Click colorama cryptography jinja2 jsonschema networkx \ + oauth2client pandas pendulum PrettyTable rich tabulate filelock packaging \ + 'protobuf<4.0.0' pulp pycryptodome==3.12.0 docker kubernetes==28.1.0 \ + grpcio==1.51.3 python-dotenv==1.0.1 ray[default]==2.9.3 && \ curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \ sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl @@ -45,7 +51,3 @@ COPY --chown=sky . /skypilot/sky/ # Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately ENV PYTHONUNBUFFERED=1 - -# Set WORKDIR and initialize conda for sky user -WORKDIR /home/sky -RUN conda init diff --git a/Dockerfile_k8s_gpu b/Dockerfile_k8s_gpu index 42bfe1cf618..9e56e321e4e 100644 --- a/Dockerfile_k8s_gpu +++ b/Dockerfile_k8s_gpu @@ -1,4 +1,6 @@ -FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 +FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04 + +ARG DEBIAN_FRONTEND=noninteractive # Install ssh and other local dependencies # We remove cuda lists to avoid conflicts with the cuda version installed by ray @@ -7,6 +9,13 @@ RUN rm -rf /etc/apt/sources.list.d/cuda* && \ apt install git gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \ rm -rf /var/lib/apt/lists/* +# Setup SSH and generate hostkeys +RUN sudo mkdir -p /var/run/sshd && \ + sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \ + cd /etc/ssh/ && \ + sudo ssh-keygen -A + # Setup new user named sky and add to sudoers. \ # Also add /opt/conda/bin to sudo path and give sky user permission to run sudo without password RUN sudo useradd -m -s /bin/bash sky && \ @@ -38,15 +47,6 @@ RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x8 curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \ sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl - -# Setup SSH and generate hostkeys -RUN sudo mkdir -p /var/run/sshd && \ - sudo chmod 0755 /var/run/sshd && \ - sudo sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ - sudo sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \ - cd /etc/ssh/ && \ - sudo ssh-keygen -A - # Add /home/sky/.local/bin/ to PATH RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc From 24e442f9edbb639276b455bda17c3f3646565c43 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 18 Jun 2024 15:17:39 -0700 Subject: [PATCH 6/6] comments --- Dockerfile_k8s_gpu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile_k8s_gpu b/Dockerfile_k8s_gpu index 9e56e321e4e..f9bc7258c61 100644 --- a/Dockerfile_k8s_gpu +++ b/Dockerfile_k8s_gpu @@ -1,3 +1,4 @@ +# We use the cuda runtime image instead of devel image to reduce size (1.3GB vs 3.6GB) FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04 ARG DEBIAN_FRONTEND=noninteractive @@ -34,7 +35,7 @@ WORKDIR /home/sky SHELL ["/bin/bash", "-c"] # Install conda and other dependencies -# Keep the Ray version below in sync with skylet.constants.SKY_REMOTE_RAY_VERSION +# Keep the conda and Ray versions below in sync with the ones in skylet.constants RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && \ bash Miniconda3-Linux-x86_64.sh -b && \ eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true && conda activate base && \