diff --git a/.github/workflows/build_and_push_to_docker.yml b/.github/workflows/build_and_push_to_docker.yml index 4981c3e3..710fef20 100644 --- a/.github/workflows/build_and_push_to_docker.yml +++ b/.github/workflows/build_and_push_to_docker.yml @@ -107,9 +107,41 @@ jobs: include: # taken from tensorflow compatibility chart at https://www.tensorflow.org/install/source#gpu - CUDA_VERSION: "11.0" - BASE_IMAGE: renku/renkulab-py:python-3.8.8 + PYTHON_VERSION: "3.8.8" + EXTRA_LIBRARIES: "libcusolver-11-2" + CUDA_CUDART_PACKAGE: "cuda-cudart-11-0=11.0.221-1" + CUDA_COMPAT_PACKAGE: "cuda-compat-11-0" + LIBCUDNN_PACKAGE: "libcudnn8=8.0.5.39-1+cuda11.0" + - CUDA_VERSION: "11.1" + PYTHON_VERSION: "3.8.8" + EXTRA_LIBRARIES: "libcusolver-11-2" + CUDA_CUDART_PACKAGE: "cuda-cudart-11-1=11.1.74-1" + CUDA_COMPAT_PACKAGE: "cuda-compat-11-1" + LIBCUDNN_PACKAGE: "libcudnn8=8.0.5.39-1+cuda11.1" - CUDA_VERSION: "11.2" - BASE_IMAGE: renku/renkulab-py:python-3.9.7 + PYTHON_VERSION: "3.9.7" + EXTRA_LIBRARIES: "" + CUDA_CUDART_PACKAGE: "cuda-cudart-11-2=11.2.152-1" + CUDA_COMPAT_PACKAGE: "cuda-compat-11-2" + LIBCUDNN_PACKAGE: "libcudnn8=8.1.1.33-1+cuda11.2" + - CUDA_VERSION: "11.3" + PYTHON_VERSION: "3.9.7" + EXTRA_LIBRARIES: "" + CUDA_CUDART_PACKAGE: "cuda-cudart-11-3=11.3.109-1" + CUDA_COMPAT_PACKAGE: "cuda-compat-11-3" + LIBCUDNN_PACKAGE: "libcudnn8=8.2.1.32-1+cuda11.3" + - CUDA_VERSION: "11.4" + PYTHON_VERSION: "3.9.7" + EXTRA_LIBRARIES: "" + CUDA_CUDART_PACKAGE: "cuda-cudart-11-4=11.4.148-1" + CUDA_COMPAT_PACKAGE: "cuda-compat-11-4" + LIBCUDNN_PACKAGE: "libcudnn8=8.2.4.15-1+cuda11.4" + - CUDA_VERSION: "11.5" + PYTHON_VERSION: "3.9.7" + EXTRA_LIBRARIES: "" + CUDA_CUDART_PACKAGE: "cuda-cudart-11-5=11.5.117-1" + CUDA_COMPAT_PACKAGE: "cuda-compat-11-5" + LIBCUDNN_PACKAGE: "libcudnn8=8.3.2.44-1+cuda11.5" steps: - name: Docker Login @@ -130,8 +162,13 @@ jobs: export LABEL=$(echo ${{ github.sha }} | cut -c 1-7) fi - docker build -f Dockerfile-${{ matrix.CUDA_VERSION }} . \ - --build-arg BASE_IMAGE="${{ matrix.BASE_IMAGE }}-$LABEL" \ + docker build . \ + --build-arg BASE_IMAGE="renku/renkulab-py:python-${{ matrix.PYTHON_VERSION }}-$LABEL" \ + --build-arg CUDA_VERSION="${{ matrix.CUDA_VERSION }}" \ + --build-arg EXTRA_LIBRARIES="${{ matrix.EXTRA_LIBRARIES }}" \ + --build-arg CUDA_CUDART_PACKAGE="${{ matrix.CUDA_CUDART_PACKAGE }}" \ + --build-arg CUDA_COMPAT_PACKAGE="${{ matrix.CUDA_COMPAT_PACKAGE }}" \ + --build-arg LIBCUDNN_PACKAGE="${{ matrix.LIBCUDNN_PACKAGE }}" \ --tag $DOCKER_NAME-cuda:${{ matrix.CUDA_VERSION }}-$LABEL echo "::set-output name=IMAGE_NAME::$DOCKER_NAME-cuda:${{ matrix.CUDA_VERSION }}-$LABEL" - name: Image Acceptance Tests @@ -147,7 +184,7 @@ jobs: docker push ${{ steps.build-image.outputs.IMAGE_NAME }} # on master push latest image - if [ "$REF" == "refs/heads/master" ] && [ "${{ matrix.CUDA_VERSION }}" == "11.2" ]; then + if [ "$REF" == "refs/heads/master" ] && [ "${{ matrix.CUDA_VERSION }}" == "11.5" ]; then docker tag ${{ steps.build-image.outputs.IMAGE_NAME }} $DOCKER_NAME-cuda:latest docker push $DOCKER_NAME-cuda:latest fi diff --git a/docker/cuda/Dockerfile-11.0 b/docker/cuda/Dockerfile similarity index 87% rename from docker/cuda/Dockerfile-11.0 rename to docker/cuda/Dockerfile index 4ff5c8af..6a96c90b 100644 --- a/docker/cuda/Dockerfile-11.0 +++ b/docker/cuda/Dockerfile @@ -13,13 +13,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get purge --autoremove -y curl \ && rm -rf /var/lib/apt/lists/* -ENV CUDA_VERSION 11.0.3 +ARG CUDA_VERSION=11.2 +ENV CUDA_VERSION $CUDA_VERSION +ARG CUDA_CUDART_PACKAGE=cuda-cudart-11-2=11.2.152-1 +ARG CUDA_COMPAT_PACKAGE=cuda-compat-11-2 +ARG EXTRA_LIBRARIES="" # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-11-0=11.0.221-1 \ - cuda-compat-11-0 \ - && ln -s cuda-11.0 /usr/local/cuda && \ + $CUDA_CUDART_PACKAGE \ + $CUDA_COMPAT_PACKAGE \ + && ln -s cuda-${CUDA_VERSION} /usr/local/cuda && \ rm -rf /var/lib/apt/lists/* # Required for nvidia-docker v1 @@ -38,10 +42,9 @@ ENV NVIDIA_REQUIRE_CUDA "cuda>=11.0 brand=tesla,driver>=418,driver<419 brand=tes # Install TensorFlow (from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile) # -ARG CUDA=11.0 -ARG CUDNN=8.0.5.39 -ARG CUDNN_MAJOR_VERSION=8 +ARG CUDA=${CUDA_VERSION} ARG LIB_DIR_PREFIX=x86_64 +ARG LIBCUDNN_PACKAGE="libcudnn8=8.1.1.33-1+cuda11.2" # Needed for string substitution SHELL ["/bin/bash", "-c"] @@ -54,10 +57,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libcufft-${CUDA/./-} \ libcurand-${CUDA/./-} \ libcusolver-${CUDA/./-} \ - libcusolver-11-2 \ libcusparse-${CUDA/./-} \ + $EXTRA_LIBRARIES \ curl \ - libcudnn8=${CUDNN}-1+cuda${CUDA} \ + ${LIBCUDNN_PACKAGE} \ libfreetype6-dev \ libhdf5-serial-dev \ libzmq3-dev \ diff --git a/docker/cuda/Dockerfile-11.2 b/docker/cuda/Dockerfile-11.2 deleted file mode 100644 index 2e2f8a87..00000000 --- a/docker/cuda/Dockerfile-11.2 +++ /dev/null @@ -1,84 +0,0 @@ -ARG BASE_IMAGE=renku/renkulab-py:latest -FROM $BASE_IMAGE - -LABEL maintainer="Swiss Data Science Center " - -USER root -# Install the CUDA "base" -RUN apt-get update && apt-get install -y --no-install-recommends \ - gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl \ - && rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 11.2 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-11-2=11.2.152-1 \ - cuda-compat-11-2 \ - && ln -s cuda-11.2 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ - && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=11.0 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 brand=tesla,driver>=450,driver<451" - -# -# Install TensorFlow (from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile) -# - -ARG CUDA=11.2 -ARG CUDNN=8.1.1.33 -ARG CUDNN_MAJOR_VERSION=8 -ARG LIB_DIR_PREFIX=x86_64 - -# Needed for string substitution -SHELL ["/bin/bash", "-c"] -# Pick up some TF dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cuda-command-line-tools-${CUDA/./-} \ - libcublas-${CUDA/./-} \ - cuda-nvrtc-${CUDA/./-} \ - libcufft-${CUDA/./-} \ - libcurand-${CUDA/./-} \ - libcusolver-${CUDA/./-} \ - libcusparse-${CUDA/./-} \ - curl \ - libcudnn8=${CUDNN}-1+cuda${CUDA} \ - libfreetype6-dev \ - libhdf5-serial-dev \ - libzmq3-dev \ - pkg-config \ - software-properties-common \ - unzip \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# For CUDA profiling, TensorFlow requires CUPTI. -ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH - -# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure -# dynamic linker run-time bindings -RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ - && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ - && ldconfig - -# See http://bugs.python.org/issue19846 -ENV LANG C.UTF-8 - -# Some TF tools expect a "python" binary -RUN ln -s $(which python3) /usr/local/bin/python - -USER $NB_USER