From b007a923720d5348948531e5710d847eb2a5bbb5 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 22 Aug 2024 11:12:00 -0500 Subject: [PATCH] remove delete-temp-images job (#709) Follow-up to #708. Proposes completely removing the `delete-temp-images` job, in favor of relying on the scheduled nightly cleanup at https://github.com/rapidsai/workflows/blob/main/.github/workflows/cleanup_staging.yaml. ## Notes for Reviewers ### Details CI here writes images to the `rapidsai/staging` repo on DockerHub, then later copies them to individual user-facing repos. To avoid those temporary CI artifacts piling up in the `rapidsai/staging` repo, pull requests and branch builds run a job called `delete-temp-images` which does what it sounds like. In exchange for more aggressive cleaning, this job introduces significant complexity for development here. Most notably, we've observed several instances where that job deletes images before all CI jobs needing them have completed successfully, leading to all of CI needing to be re-run. Significant effort has been put into trying to avoid that, and we've found it's been difficult to get it right: some attempts: * #702 * #708 a recent example: * https://github.com/rapidsai/docker/pull/696#issuecomment-2299835638 ### Ok so how will we clean up? The workflow at https://github.com/rapidsai/workflows/blob/main/.github/workflows/cleanup_staging.yaml. It runs once a day and deletes anything from `rapidsai/staging` that's more than 30 days old. ### Benefits of these changes As described in https://github.com/rapidsai/docker/pull/708#discussion_r1718850401 ... CI here will work as it does in other RAPIDS repos.... if any jobs fail for retryable reasons (like network issues), you can safely click "re-run failed jobs" and make incremental progress towards all builds passing. Also reduces the need to maintain code that has to keep up with the DockerHub API in two places (by deleting `ci/delete-temp-images.sh` here). 
Authors: - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) - https://github.com/jakirkham URL: https://github.com/rapidsai/docker/pull/709 --- .../workflows/build-test-publish-images.yml | 35 ------------- CONTRIBUTING.md | 11 ++++ ci/delete-temp-images.sh | 50 ------------------- matrix-test.yaml | 6 +-- matrix.yaml | 1 - raft-ann-bench/cpu/Dockerfile | 2 +- 6 files changed, 15 insertions(+), 90 deletions(-) delete mode 100755 ci/delete-temp-images.sh diff --git a/.github/workflows/build-test-publish-images.yml b/.github/workflows/build-test-publish-images.yml index 91d26630..848dc26c 100644 --- a/.github/workflows/build-test-publish-images.yml +++ b/.github/workflows/build-test-publish-images.yml @@ -39,7 +39,6 @@ jobs: - build - build-multiarch-manifest - test - - delete-temp-images secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: @@ -249,37 +248,3 @@ jobs: cuda${{ matrix.CUDA_VER }}-\ py${{ matrix.PYTHON_VER }}-\ ${{ matrix.ARCH }}" - delete-temp-images: - if: ${{ !cancelled() && (needs.test.result == 'success' || needs.test.result == 'skipped') }} - needs: [compute-matrix, build, build-multiarch-manifest, test] - strategy: - matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} - fail-fast: false - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Remove temporary images - shell: bash - env: - RAFT_ANN_BENCH_CPU_IMAGE_BUILT: ${{ matrix.BUILD_RAFT_ANN_BENCH_CPU_IMAGE }} - BASE_IMAGE_REPO: ${{ needs.compute-matrix.outputs.BASE_IMAGE_REPO }} - BASE_TAG_PREFIX: ${{ needs.compute-matrix.outputs.BASE_TAG_PREFIX }} - RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }} - ALPHA_TAG: ${{ needs.compute-matrix.outputs.ALPHA_TAG }} - CUDA_TAG: ${{ matrix.CUDA_TAG }} - PYTHON_VER: ${{ matrix.PYTHON_VER }} - 
NOTEBOOKS_IMAGE_REPO: ${{ needs.compute-matrix.outputs.NOTEBOOKS_IMAGE_REPO }} - NOTEBOOKS_TAG_PREFIX: ${{ needs.compute-matrix.outputs.NOTEBOOKS_TAG_PREFIX }} - RAFT_ANN_BENCH_IMAGE_REPO: ${{ needs.compute-matrix.outputs.RAFT_ANN_BENCH_IMAGE_REPO }} - RAFT_ANN_BENCH_TAG_PREFIX: ${{ needs.compute-matrix.outputs.RAFT_ANN_BENCH_TAG_PREFIX }} - RAFT_ANN_BENCH_DATASETS_IMAGE_REPO: ${{ needs.compute-matrix.outputs.RAFT_ANN_BENCH_DATASETS_IMAGE_REPO }} - RAFT_ANN_BENCH_DATASETS_TAG_PREFIX: ${{ needs.compute-matrix.outputs.RAFT_ANN_BENCH_DATASETS_TAG_PREFIX }} - RAFT_ANN_BENCH_CPU_IMAGE_REPO: ${{ needs.compute-matrix.outputs.RAFT_ANN_BENCH_CPU_IMAGE_REPO }} - RAFT_ANN_BENCH_CPU_TAG_PREFIX: ${{ needs.compute-matrix.outputs.RAFT_ANN_BENCH_CPU_TAG_PREFIX }} - GPUCIBOT_DOCKERHUB_USER: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }} - GPUCIBOT_DOCKERHUB_TOKEN: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }} - ARCHES: ${{ toJSON(matrix.ARCHES) }} - run: ci/delete-temp-images.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9b7a9c9a..50627781 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,3 +15,14 @@ To build just the `base` image with default arguments: `docker buildx build --pu - `CUDA_VER` - Version of CUDA to use. Should be `major.minor.patch` - `PYTHON_VER` - Version of Python to use. Should be `major.minor` - `RAPIDS_VER` - Version of RAPIDS to use. Should be `YY.MM` + +## Cleaning Up + +Every build first writes images to the https://hub.docker.com/r/rapidsai/staging repo on DockerHub, +then pushes them on to the individual repos like `rapidsai/base`, `rapidsai/notebooks`, etc. + +A scheduled job regularly deletes old images from that `rapidsai/staging` repo. +See https://github.com/rapidsai/workflows/blob/main/.github/workflows/cleanup_staging.yaml for details. 
+ +If you come back to a pull request here after more than a few days and find that jobs are failing with errors +that suggest that some necessary images don't exist, re-run all of CI on that pull request to produce new images. diff --git a/ci/delete-temp-images.sh b/ci/delete-temp-images.sh deleted file mode 100755 index ed1d9c59..00000000 --- a/ci/delete-temp-images.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -set -eEuo pipefail - -# Authenticate and retrieve DockerHub token -HUB_TOKEN=$( -curl -s -H "Content-Type: application/json" \ - -X POST \ - -d "{\"username\": \"$GPUCIBOT_DOCKERHUB_USER\", \"password\": \"$GPUCIBOT_DOCKERHUB_TOKEN\"}" \ - https://hub.docker.com/v2/users/login/ | jq -r .token \ -) -echo "::add-mask::${HUB_TOKEN}" - -org="rapidsai" - -# Define tag arrays for different images -base_tag="${BASE_TAG_PREFIX}${RAPIDS_VER}${ALPHA_TAG}-cuda${CUDA_TAG}-py${PYTHON_VER}" -notebooks_tag="${NOTEBOOKS_TAG_PREFIX}${RAPIDS_VER}${ALPHA_TAG}-cuda${CUDA_TAG}-py${PYTHON_VER}" -raft_ann_bench_tag="${RAFT_ANN_BENCH_TAG_PREFIX}${RAPIDS_VER}${ALPHA_TAG}-cuda${CUDA_TAG}-py${PYTHON_VER}" -raft_ann_bench_datasets_tag="${RAFT_ANN_BENCH_DATASETS_TAG_PREFIX}${RAPIDS_VER}${ALPHA_TAG}-cuda${CUDA_TAG}-py${PYTHON_VER}" -raft_ann_bench_cpu_tag="${RAFT_ANN_BENCH_CPU_TAG_PREFIX}${RAPIDS_VER}${ALPHA_TAG}-py${PYTHON_VER}" - -for arch in $(echo "${ARCHES}" | jq .[] -r); do - curl -i -X DELETE \ - -H "Accept: application/json" \ - -H "Authorization: JWT $HUB_TOKEN" \ - "https://hub.docker.com/v2/repositories/$org/$BASE_IMAGE_REPO/tags/$base_tag-$arch/" - - curl -i -X DELETE \ - -H "Accept: application/json" \ - -H "Authorization: JWT $HUB_TOKEN" \ - "https://hub.docker.com/v2/repositories/$org/$NOTEBOOKS_IMAGE_REPO/tags/$notebooks_tag-$arch/" - - curl -i -X DELETE \ - -H "Accept: application/json" \ - -H "Authorization: JWT $HUB_TOKEN" \ - "https://hub.docker.com/v2/repositories/$org/$RAFT_ANN_BENCH_IMAGE_REPO/tags/$raft_ann_bench_tag-$arch/" - - curl -i -X DELETE \ - -H
"Accept: application/json" \ - -H "Authorization: JWT $HUB_TOKEN" \ - "https://hub.docker.com/v2/repositories/$org/$RAFT_ANN_BENCH_DATASETS_IMAGE_REPO/tags/$raft_ann_bench_datasets_tag-$arch/" - - if [ "$RAFT_ANN_BENCH_CPU_IMAGE_BUILT" = "true" ]; then - curl -i -X DELETE \ - -H "Accept: application/json" \ - -H "Authorization: JWT $HUB_TOKEN" \ - "https://hub.docker.com/v2/repositories/$org/$RAFT_ANN_BENCH_CPU_IMAGE_REPO/tags/$raft_ann_bench_cpu_tag-$arch/" - fi -done diff --git a/matrix-test.yaml b/matrix-test.yaml index f8ffdc60..a5e1ad18 100644 --- a/matrix-test.yaml +++ b/matrix-test.yaml @@ -1,13 +1,13 @@ # CUDA_VER is `<major>.<minor>` (e.g. `12.0`) pull-request: - - { CUDA_VER: '11.8', ARCH: 'amd64', PYTHON_VER: '3.9', GPU: 'v100', DRIVER: 'earliest' } + - { CUDA_VER: '11.8', ARCH: 'amd64', PYTHON_VER: '3.10', GPU: 'v100', DRIVER: 'earliest' } - { CUDA_VER: '12.0', ARCH: 'amd64', PYTHON_VER: '3.10', GPU: 'v100', DRIVER: 'latest' } - { CUDA_VER: '12.2', ARCH: 'arm64', PYTHON_VER: '3.11', GPU: 'a100', DRIVER: 'latest' } - { CUDA_VER: '12.5', ARCH: 'amd64', PYTHON_VER: '3.11', GPU: 'v100', DRIVER: 'latest' } branch: - - { CUDA_VER: '11.8', ARCH: 'amd64', PYTHON_VER: '3.9', GPU: 'v100', DRIVER: 'earliest' } - - { CUDA_VER: '11.8', ARCH: 'amd64', PYTHON_VER: '3.9', GPU: 'v100', DRIVER: 'latest' } + - { CUDA_VER: '11.8', ARCH: 'amd64', PYTHON_VER: '3.10', GPU: 'v100', DRIVER: 'earliest' } + - { CUDA_VER: '11.8', ARCH: 'amd64', PYTHON_VER: '3.10', GPU: 'v100', DRIVER: 'latest' } - { CUDA_VER: '12.0', ARCH: 'amd64', PYTHON_VER: '3.10', GPU: 'v100', DRIVER: 'latest' } - { CUDA_VER: '12.0', ARCH: 'arm64', PYTHON_VER: '3.10', GPU: 'a100', DRIVER: 'latest' } - { CUDA_VER: '12.2', ARCH: 'amd64', PYTHON_VER: '3.11', GPU: 'v100', DRIVER: 'latest' } diff --git a/matrix.yaml b/matrix.yaml index 0e0c0c0b..2df9844b 100644 --- a/matrix.yaml +++ b/matrix.yaml @@ -4,6 +4,5 @@ CUDA_VER: # Should be `<major>.<minor>.<patch>` (e.g.
`11.2.2`) - "12.2.2" - "12.5.1" PYTHON_VER: - - "3.9" - "3.10" - "3.11" diff --git a/raft-ann-bench/cpu/Dockerfile b/raft-ann-bench/cpu/Dockerfile index af0e9823..1910ede7 100644 --- a/raft-ann-bench/cpu/Dockerfile +++ b/raft-ann-bench/cpu/Dockerfile @@ -23,7 +23,7 @@ EOF # we need perl temporarily for the remaining benchmark perl scripts RUN apt-get install perl -y -# Install python before updating environment, otherwise Python 3.9 image +# Install python before updating environment, otherwise Python 3 image # runs into a solver conflict with truststore 0.8.0. This avoids the environment installing # packages incompatible with python version needed before python itself is pinned to the correct version. RUN <