From 0d1da41ca82a0e90f71e987c25ef196a97f83c51 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 6 Aug 2024 21:37:09 +0800 Subject: [PATCH] Fix docker image layer caching to avoid redundant docker building and transient connection exceptions. (#21612) ### Description Improve the docker commands so that docker image layer caching works. This makes docker builds faster and more stable. For now, the A100 pool's system disk is too small to use the docker cache. We no longer use the pipeline cache for docker images, and some legacy code is removed. ### Motivation and Context Docker builds often fail with an exception like ``` 64.58 + curl https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-x64.tar.gz -sSL --retry 5 --retry-delay 30 --create-dirs -o /tmp/src/node-v18.17.1-linux-x64.tar.gz --fail 286.4 curl: (92) HTTP/2 stream 0 was not closed cleanly: INTERNAL_ERROR (err 2) ``` This happens because the ONNX Runtime pipelines have been sending too many requests to download Node.js during docker builds, which is currently the major cause of pipeline failures. In fact, docker image layer caching has never worked. We can always see that the scripts are still being run ``` #9 [3/5] RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts #9 0.234 /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) #9 0.235 /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) #9 0.235 /tmp/scripts/install_centos.sh: line 1: !/bin/bash: No such file or directory #9 0.235 ++ '[' '!' -f /etc/yum.repos.d/microsoft-prod.repo ']' #9 0.236 +++ tr -dc 0-9. #9 0.236 +++ cut -d . -f1 #9 0.238 ++ os_major_version=8 .... #9 60.41 + curl https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-x64.tar.gz -sSL --retry 5 --retry-delay 30 --create-dirs -o /tmp/src/node-v18.17.1-linux-x64.tar.gz --fail #9 60.59 + return 0 ... ``` This PR improves the docker commands so that image layer caching works. As a result, CI no longer sends so many redundant requests to download Node.js. 
``` #9 [2/5] ADD scripts /tmp/scripts #9 CACHED #10 [3/5] RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts #10 CACHED #11 [4/5] RUN adduser --uid 1000 onnxruntimedev #11 CACHED #12 [5/5] WORKDIR /home/onnxruntimedev #12 CACHED ``` ###Reference https://docs.docker.com/build/drivers/ --------- Co-authored-by: Yi Zhang --- tools/ci_build/get_docker_image.py | 24 ++----- .../azure-pipelines/bigmodels-ci-pipeline.yml | 1 + .../templates/c-api-linux-cpu.yml | 8 +-- .../templates/get-docker-image-steps.yml | 64 ++++++------------- .../inference/aarch64/default/cpu/Dockerfile | 2 +- .../inference/x86_64/default/cpu/Dockerfile | 2 +- 6 files changed, 32 insertions(+), 69 deletions(-) diff --git a/tools/ci_build/get_docker_image.py b/tools/ci_build/get_docker_image.py index 99ecaf677f339..a3f603b0beda4 100755 --- a/tools/ci_build/get_docker_image.py +++ b/tools/ci_build/get_docker_image.py @@ -98,17 +98,19 @@ def main(): ) if use_container_registry: + run(args.docker_path, "buildx", "create", "--driver=docker-container", "--name=container_builder") run( args.docker_path, "--log-level", "error", "buildx", "build", - "--push", + "--load", "--tag", full_image_name, - "--cache-from", - full_image_name, + "--cache-from=type=registry,ref=" + full_image_name, + "--builder", + "container_builder", "--build-arg", "BUILDKIT_INLINE_CACHE=1", *shlex.split(args.docker_build_args), @@ -116,24 +118,10 @@ def main(): args.dockerfile, args.context, ) - elif args.use_imagecache: - log.info("Building image with pipeline cache...") run( args.docker_path, - "--log-level", - "error", - "buildx", - "build", - "--tag", - full_image_name, - "--cache-from", + "push", full_image_name, - "--build-arg", - "BUILDKIT_INLINE_CACHE=1", - *shlex.split(args.docker_build_args), - "-f", - args.dockerfile, - args.context, ) else: log.info("Building image...") diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index a66828ee5e188..4a3532dd57fa3 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -321,6 +321,7 @@ stages: --build-arg TRT_VERSION=${{ variables.linux_trt_version }} " Repository: onnxruntimeubi8packagestest_torch + UseImageCacheContainerRegistry: false UpdateDepsTxt: false - task: DownloadPackage@1 diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml index e2b71c5c55fd2..0f4328f75e1bd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml @@ -51,15 +51,15 @@ jobs: Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{parameters.BaseImage}}" - Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} - + Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}}_packaging + - ${{ if eq(parameters.OnnxruntimeArch, 'aarch64') }}: - template: get-docker-image-steps.yml parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/aarch64/default/cpu DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{parameters.BaseImage}}" - Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} + Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}}_packaging UpdateDepsTxt: false - task: CmdLine@2 @@ -67,7 +67,7 @@ jobs: script: | mkdir -p $HOME/.onnx docker run --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume 
$(Build.BinariesDirectory):/build \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} /bin/bash -c "python3.9 \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}}_packaging /bin/bash -c "python3.9 \ /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release \ --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib ${{ parameters.AdditionalBuildFlags }} && cd /build/Release && make install DESTDIR=/build/installed" workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml b/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml index 94cdf042ec62b..5b6769685a972 100644 --- a/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml @@ -53,6 +53,7 @@ steps: displayName: patch manylinux - script: | + docker version docker image ls docker system df displayName: Check Docker Images @@ -71,52 +72,25 @@ steps: displayName: "Get ${{ parameters.Repository }} image for ${{ parameters.Dockerfile }}" ContainerRegistry: onnxruntimebuildcache - ${{ if eq(parameters.UseImageCacheContainerRegistry, false) }}: - - task: Cache@2 - displayName: Cache Docker Image Task - inputs: - key: ' "${{ parameters.Repository }}" | "$(Build.SourceVersion)" ' - path: ${{ parameters.IMAGE_CACHE_DIR }} - restoreKeys: | - "${{ parameters.Repository }}" | "$(Build.SourceVersion)" - "${{ parameters.Repository }}" - cacheHitVar: CACHE_RESTORED - condition: eq('${{ parameters.UsePipelineCache }}', 'true') - - - script: | - test -f ${{ parameters.IMAGE_CACHE_DIR }}/cache.tar && docker load -i ${{ parameters.IMAGE_CACHE_DIR }}/cache.tar - docker image ls - displayName: Docker 
restore - condition: eq('${{ parameters.UsePipelineCache }}', 'true') - - - script: | - if [ ${{ parameters.UsePipelineCache}} ] - then - use_imagecache="--use_imagecache" - else - use_imagecache="" - fi - ${{ parameters.ScriptName }} \ - --dockerfile "${{ parameters.Dockerfile }}" \ - --context "${{ parameters.Context }}" \ - --docker-build-args "${{ parameters.DockerBuildArgs }}" \ - --repository "${{ parameters.Repository }}" \ - $use_imagecache - displayName: "Get ${{ parameters.Repository }} image for ${{ parameters.Dockerfile }}" - - - script: | - set -ex - mkdir -p "${{ parameters.IMAGE_CACHE_DIR }}" - docker save -o "${{ parameters.IMAGE_CACHE_DIR }}/cache.tar" ${{ parameters.Repository }} - docker image ls - docker system df - displayName: Docker save - condition: eq('${{ parameters.UsePipelineCache }}', 'true') + # the difference is no --container-registry + - template: with-container-registry-steps.yml + parameters: + Steps: + - script: | + ${{ parameters.ScriptName }} \ + --dockerfile "${{ parameters.Dockerfile }}" \ + --context "${{ parameters.Context }}" \ + --docker-build-args "${{ parameters.DockerBuildArgs }}" \ + --repository "${{ parameters.Repository }}" + displayName: "Get ${{ parameters.Repository }} image for ${{ parameters.Dockerfile }}" + ContainerRegistry: onnxruntimebuildcache - - script: | - echo ${{ parameters.IMAGE_CACHE_DIR }} - ls -lah ${{ parameters.IMAGE_CACHE_DIR }} - displayName: Display docker dir - condition: eq('${{ parameters.UsePipelineCache }}', 'true') +- script: | + docker version + docker image ls + docker system df + df -h + displayName: Check Docker Images - ${{ if and(eq(parameters.UpdateDepsTxt, true), or(eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29'),eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c'))) }}: - task: PythonScript@0 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile 
b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 2cd054e6246bc..ca00050121d67 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=arm64v8/almalinux:8 FROM $BASEIMAGE -ENV PATH /opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PATH=/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile index caf9583807b62..ef28dde67617f 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=amd64/almalinux:8 FROM $BASEIMAGE -ENV PATH /usr/lib/jvm/msopenjdk-11/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PATH=/usr/lib/jvm/msopenjdk-11/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11