feat: NGC+ Image Template (#235)
* add templates for NGC+ images

* add image matrix

* backport lots of improvements to scripts

* remove tf2.8 images
MikhailKardash authored Mar 7, 2024
1 parent 2196775 commit 03ae7d7
Showing 17 changed files with 285 additions and 197 deletions.
34 changes: 20 additions & 14 deletions .circleci/config.yml
@@ -188,20 +188,21 @@ workflows:
with-mpi: [0, 1]
image-type:
- tf2-cpu
- tf28-cpu
- pt-cpu
- pt2-cpu
- tf2-gpu
- tf28-gpu
- pt-gpu
- pt2-gpu
- tensorflow-ngc
- pytorch13-tf210-rocm56
- pytorch20-tf210-rocm56
exclude:
- with-mpi: 1
image-type:
- pytorch13-tf210-rocm56
- pytorch20-tf210-rocm56
image-type: pytorch13-tf210-rocm56
- with-mpi: 1
image-type: pytorch20-tf210-rocm56
- with-mpi: 1
image-type: tensorflow-ngc
- build-and-publish-docker:
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>
context: determined-production
@@ -219,8 +220,9 @@ workflows:
- gpu.nvidia.small.multi
with-mpi: [0]
image-type:
- deepspeed-gpu
- gpt-neox-deepspeed-gpu
- deepspeed
- gpt-neox-deepspeed
- pytorch-ngc
- publish-cloud-images:
context: determined-production
filters:
@@ -253,21 +255,24 @@ workflows:
with-mpi: [0, 1]
image-type:
- tf2-cpu
- tf28-cpu
- pt-cpu
- pt2-cpu
- tf2-gpu
- tf28-gpu
- pt-gpu
- pt2-gpu
- tensorflow-ngc
- pytorch13-tf210-rocm56
- pytorch20-tf210-rocm56
exclude:
- dev-mode: true
with-mpi: 1
image-type:
- pytorch13-tf210-rocm56
- pytorch20-tf210-rocm56
image-type: pytorch13-tf210-rocm56
- dev-mode: true
with-mpi: 1
image-type: pytorch20-tf210-rocm56
- dev-mode: true
with-mpi: 1
image-type: tensorflow-ngc

- build-and-publish-docker:
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev
@@ -287,8 +292,9 @@ workflows:
- gpu.nvidia.small.multi
with-mpi: [0]
image-type:
- deepspeed-gpu
- gpt-neox-deepspeed-gpu
- deepspeed
- gpt-neox-deepspeed
- pytorch-ngc

- publish-cloud-images:
name: publish-cloud-images-dev
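
The CircleCI matrix above is where the new image types land: tensorflow-ngc joins the main image-type list, pytorch-ngc joins the deepspeed/gpt-neox block, the tf28-* variants go away, and the exclude entries skip the MPI (with-mpi: 1) builds for the ROCm and NGC images. As a rough illustration of how such a matrix fans out (this loop is not in the repo; the job names simply follow the name template in the config):

for image_type in tf2-cpu pt-cpu pt2-cpu tf2-gpu pt-gpu pt2-gpu \
                  tensorflow-ngc pytorch13-tf210-rocm56 pytorch20-tf210-rocm56; do
  for with_mpi in 0 1; do
    case "${image_type}:${with_mpi}" in
      tensorflow-ngc:1 | pytorch13-tf210-rocm56:1 | pytorch20-tf210-rocm56:1)
        continue ;;  # combinations dropped by the exclude entries above
    esac
    echo "build-and-publish-docker-${image_type}-${with_mpi}"
  done
done

The second job block (deepspeed, gpt-neox-deepspeed, pytorch-ngc) only runs with with-mpi: [0], so it needs no exclude entries.
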
37 changes: 2 additions & 35 deletions Dockerfile-base-cpu
@@ -4,41 +4,10 @@ ARG UBUNTU_VERSION
RUN rm -f /etc/apt/sources.list.d/*
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PIP_NO_CACHE_DIR=1

RUN mkdir -p /var/run/sshd
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
automake \
autotools-dev \
build-essential \
ca-certificates \
curl \
daemontools \
libkrb5-dev \
libssl-dev \
libtool \
git \
krb5-user \
g++ \
cmake \
make \
openssh-client \
openssh-server \
pkg-config \
wget \
nfs-common \
libnuma1 \
libnuma-dev \
libpmi2-0-dev \
unattended-upgrades \
&& unattended-upgrade \
&& rm -rf /var/lib/apt/lists/* \
&& rm /etc/ssh/ssh_host_ecdsa_key \
&& rm /etc/ssh/ssh_host_ed25519_key \
&& rm /etc/ssh/ssh_host_rsa_key

COPY dockerfile_scripts /tmp/det_dockerfile_scripts

RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh

ENV PATH="/opt/conda/bin:${PATH}"
ARG CONDA="${PATH}"
ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0
@@ -84,8 +53,6 @@ ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}
ENV OMPI_ALLOW_RUN_AS_ROOT ${WITH_MPI:+1}
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM ${WITH_MPI:+1}



# We uninstall these packages after installing. This ensures that we can
# successfully install these packages into containers running as non-root.
# `pip` does not uninstall dependencies, so we still have all the dependencies
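
Both base Dockerfiles now delegate OS package installation to dockerfile_scripts/install_deb_packages.sh instead of carrying the long inline apt-get block. The script itself is not shown in this diff; a minimal sketch of the pattern, reusing the package list from the inline block it replaces (the GPU base additionally pulled in ibverbs-providers, libibverbs1, and librdmacm1), might look like:

#!/usr/bin/env bash
# Sketch only: the real dockerfile_scripts/install_deb_packages.sh is not part
# of this commit; the package names come from the removed inline block.
set -e
apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    autoconf automake autotools-dev build-essential ca-certificates curl \
    daemontools git krb5-user g++ cmake make libkrb5-dev libssl-dev libtool \
    openssh-client openssh-server pkg-config wget nfs-common \
    libnuma1 libnuma-dev libpmi2-0-dev unattended-upgrades
unattended-upgrade
rm -rf /var/lib/apt/lists/*
rm -f /etc/ssh/ssh_host_ecdsa_key /etc/ssh/ssh_host_ed25519_key /etc/ssh/ssh_host_rsa_key
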
38 changes: 2 additions & 36 deletions Dockerfile-base-gpu
@@ -7,45 +7,11 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PIP_NO_CACHE_DIR=1
# We need to create sym links for the Slurm PMI headers if we are using
# Ubuntu 18.04 because they are not installed in a standard location.
ARG UBUNTU_VERSION
RUN mkdir -p /var/run/sshd
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
automake \
autotools-dev \
build-essential \
ca-certificates \
curl \
daemontools \
ibverbs-providers \
libibverbs1 \
libkrb5-dev \
librdmacm1 \
libssl-dev \
libtool \
git \
krb5-user \
g++ \
cmake \
make \
openssh-client \
openssh-server \
pkg-config \
wget \
nfs-common \
libnuma1 \
libnuma-dev \
libpmi2-0-dev \
unattended-upgrades \
&& unattended-upgrade \
&& rm -rf /var/lib/apt/lists/* \
&& rm /etc/ssh/ssh_host_ecdsa_key \
&& rm /etc/ssh/ssh_host_ed25519_key \
&& rm /etc/ssh/ssh_host_rsa_key \
&& if [ "$UBUNTU_VERSION" = "ubuntu18.04" ]; then ln -s /usr/include/slurm-wlm /usr/include/slurm; fi

COPY dockerfile_scripts /tmp/det_dockerfile_scripts

RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh

ARG WITH_NCCL
# Install debuild util, etc. for later compiling GDRcopy libraries
RUN if [ "$WITH_NCCL" = "1" ]; then apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y devscripts debhelper; fi
14 changes: 5 additions & 9 deletions Dockerfile-default-cpu
@@ -10,15 +10,16 @@ ARG TENSORFLOW_PIP
ARG TORCH_PIP
ARG TORCHVISION_PIP
RUN if [ "$TENSORFLOW_PIP" ]; then \
export HOROVOD_WITH_TENSORFLOW=1 && \
export HOROVOD_WITH_TENSORFLOW=1 && \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
TENSORFLOW_AARCH64_PIP=$(printf '%s' "$TENSORFLOW_PIP" | sed 's/cpu/aarch64/') \
&& pip install $TENSORFLOW_AARCH64_PIP; \
else \
pip install $TENSORFLOW_PIP; \
fi; \
else \
export HOROVOD_WITH_TENSORFLOW=0; \
pip install -r /tmp/det_dockerfile_scripts/additional-requirements-tf.txt; \
else \
export HOROVOD_WITH_TENSORFLOW=0; \
fi
RUN if [ "$TORCH_PIP" ]; then \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
@@ -27,15 +28,10 @@ RUN if [ "$TORCH_PIP" ]; then \
else \
pip install $TORCH_PIP; \
fi; \
pip install -r /tmp/det_dockerfile_scripts/additional-requirements-torch.txt; \
fi
RUN if [ "$TORCHVISION_PIP" ]; then pip install $TORCHVISION_PIP; fi

ARG TORCH_TB_PROFILER_PIP
RUN if [ "$TORCH_TB_PROFILER_PIP" ]; then pip install $TORCH_TB_PROFILER_PIP; fi

ARG TF_PROFILER_PIP
RUN if [ "$TF_PROFILER_PIP" ]; then python -m pip install $TF_PROFILER_PIP; fi

ARG HOROVOD_WITH_TENSORFLOW
RUN if [ "$HOROVOD_WITH_TENSORFLOW" ]; then export HOROVOD_WITH_TENSORFLOW=$HOROVOD_WITH_TENSORFLOW; fi

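
With this change the per-framework extras move next to the framework install: additional-requirements-tf.txt is installed only when TENSORFLOW_PIP is set, additional-requirements-torch.txt only when TORCH_PIP is set, and the separate TORCH_TB_PROFILER_PIP/TF_PROFILER_PIP steps are dropped. A hypothetical local build of a PyTorch-only CPU variant (BASE_IMAGE and the version pins here are placeholders, not the values CI passes):

docker build -f Dockerfile-default-cpu \
    --build-arg BASE_IMAGE=determinedai/environments:base-cpu-example \
    --build-arg TORCH_PIP="torch==2.0.1" \
    --build-arg TORCHVISION_PIP="torchvision==0.15.2" \
    -t local/pt2-cpu .

Leaving TENSORFLOW_PIP unset takes the HOROVOD_WITH_TENSORFLOW=0 branch, so neither TensorFlow nor its extra requirements are installed.
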
22 changes: 10 additions & 12 deletions Dockerfile-default-gpu
@@ -27,23 +27,21 @@ ARG TORCH_PIP
ARG TORCHVISION_PIP

RUN if [ "$TENSORFLOW_PIP" ]; then \
export HOROVOD_WITH_TENSORFLOW=1 \
&& python -m pip install $TENSORFLOW_PIP; \
else \
export HOROVOD_WITH_TENSORFLOW=0; \
fi
RUN if [ "$TORCH_PIP" ]; then python -m pip install $TORCH_PIP; fi
export HOROVOD_WITH_TENSORFLOW=1 \
&& python -m pip install $TENSORFLOW_PIP \
&& python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-tf.txt; \
else \
export HOROVOD_WITH_TENSORFLOW=0; \
fi
RUN if [ "$TORCH_PIP" ]; then \
python -m pip install $TORCH_PIP \
&& python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-torch.txt; \
fi
RUN if [ "$TORCHVISION_PIP" ]; then python -m pip install $TORCHVISION_PIP; fi

ARG TF_CUDA_SYM
RUN if [ "$TF_CUDA_SYM" ]; then ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/lib/python3.8/site-packages/tensorflow/python/libcusolver.so.10; fi

ARG TORCH_TB_PROFILER_PIP
RUN if [ "$TORCH_TB_PROFILER_PIP" ]; then python -m pip install $TORCH_TB_PROFILER_PIP; fi

ARG TF_PROFILER_PIP
RUN if [ "$TF_PROFILER_PIP" ]; then python -m pip install $TF_PROFILER_PIP; fi

ARG TORCH_CUDA_ARCH_LIST
ARG APEX_GIT
RUN /tmp/det_dockerfile_scripts/install_apex.sh
24 changes: 24 additions & 0 deletions Dockerfile-ngc-hpc
@@ -0,0 +1,24 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# Copy various shell scripts that group dependencies for install
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
ARG WITH_AWS_TRACE
ARG INTERNAL_AWS_DS
ARG INTERNAL_AWS_PATH
RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws.sh "$WITH_OFI" "$WITH_AWS_TRACE"; fi

#USING OFI
ARG AWS_LIB_DIR=${AWS_PLUGIN_INSTALL_DIR}/lib
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_LIB_DIR:}$LD_LIBRARY_PATH

# Set an entrypoint that can scrape up the host libfabric.so and then
# run the user command. This is intended to enable performant execution
# on non-IB systems that have a proprietary libfabric.
RUN mkdir -p /container/bin && \
cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin
ENTRYPOINT ["/container/bin/scrape_libs.sh"]

RUN rm -r /tmp/*
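
The entrypoint comment is the core idea of Dockerfile-ngc-hpc: pick up a host-provided libfabric at container start, then hand control to the user command. scrape_libs.sh itself is not shown in this diff; a minimal sketch of that kind of entrypoint, with an assumed /host/libfabric bind-mount path, might be:

#!/usr/bin/env bash
# Sketch only: the real dockerfile_scripts/scrape_libs.sh is not part of this
# diff, and /host/libfabric is an assumed mount point for illustration.
set -e
if [ -d /host/libfabric ]; then
    # Prefer the host's (possibly proprietary) libfabric over the bundled one.
    export LD_LIBRARY_PATH="/host/libfabric:${LD_LIBRARY_PATH}"
fi
exec "$@"  # run the user command unchanged
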
34 changes: 34 additions & 0 deletions Dockerfile-pytorch-ngc
@@ -0,0 +1,34 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# NGC images contain user owned files in /usr/lib
RUN chown root:root /usr/lib

# Copy various shell scripts that group dependencies for install
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh
RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh
RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh

# We uninstall these packages after installing. This ensures that we can
# successfully install these packages into containers running as non-root.
# `pip` does not uninstall dependencies, so we still have all the dependencies
# installed.
RUN python -m pip install determined && python -m pip uninstall -y determined

RUN python -m pip install \
-r /tmp/det_dockerfile_scripts/additional-requirements-torch.txt \
-r /tmp/det_dockerfile_scripts/additional-requirements.txt \
-r /tmp/det_dockerfile_scripts/notebook-requirements.txt

ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data
ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime

RUN /tmp/det_dockerfile_scripts/install_google_cloud_sdk.sh

ENV DEEPSPEED_PIP="deepspeed==0.13.0"
RUN /tmp/det_dockerfile_scripts/install_deepspeed.sh

RUN rm -r /tmp/*
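
The install-then-uninstall step above pre-bakes determined's dependency tree while keeping the determined package itself out of the image, so a non-root task user can later install a matching wheel without writing to the root-owned site-packages. Illustrative commands (the image tag is an assumed local name, and a writable HOME for the --user install is assumed):

docker run --rm --user 1000:1000 -e HOME=/tmp local/pytorch-ngc \
    python -m pip install --user determined
# Only the determined wheel itself lands in ~/.local; its dependencies are
# already present from the install/uninstall step in the Dockerfile.
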
31 changes: 31 additions & 0 deletions Dockerfile-tensorflow-ngc
@@ -0,0 +1,31 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# NGC images contain user owned files in /usr/lib
RUN chown root:root /usr/lib

# Copy various shell scripts that group dependencies for install
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh
RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh
RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh

# We uninstall these packages after installing. This ensures that we can
# successfully install these packages into containers running as non-root.
# `pip` does not uninstall dependencies, so we still have all the dependencies
# installed.
RUN python -m pip install determined && python -m pip uninstall -y determined

RUN python -m pip install \
-r /tmp/det_dockerfile_scripts/additional-requirements-tf.txt \
-r /tmp/det_dockerfile_scripts/additional-requirements.txt \
-r /tmp/det_dockerfile_scripts/notebook-requirements.txt

ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data
ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime

RUN /tmp/det_dockerfile_scripts/install_google_cloud_sdk.sh

RUN rm -r /tmp/*
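
Like the PyTorch variant, this Dockerfile layers Determined's requirements, the det-nobody user, and the Jupyter/GCloud tooling on top of an NGC base passed in via BASE_IMAGE. A hypothetical local build (the NGC tag is only an example; the tag CI actually pins is not visible in this diff):

docker build -f Dockerfile-tensorflow-ngc \
    --build-arg BASE_IMAGE=nvcr.io/nvidia/tensorflow:24.01-tf2-py3 \
    -t local/tensorflow-ngc .
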
