From d7c8598de43cd529cf607bfbfed497ec0817a342 Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Thu, 21 Jul 2022 15:42:40 -0400 Subject: [PATCH] Help preserve release-branch CI VM Images For release-branches, CI VM images must be retained long-term since they are difficult/impossible to rebuild. A number of times, often due to human error, these images have been accidentally lost. Update automation tooling such that these images are specially marked by the timestamp-updating container that runs with every CI build. Later, when another container runs to check for disused images, ensure the specially marked images are never deprecated or removed. Finally when the deletion container runs, if a deprecated image is found specially marked, issue a loud error that will be delivered to the podman-monitor list. Update documentation to reflect these changes. Signed-off-by: Chris Evich --- README.md | 15 +++++-- imgobsolete/entrypoint.sh | 10 +++++ imgprune/entrypoint.sh | 5 ++- imgts/Containerfile | 2 +- imgts/entrypoint.sh | 90 +++++++++++++++++++++++++++++++++++---- 5 files changed, 108 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index a65d2073..5ab4f76c 100644 --- a/README.md +++ b/README.md @@ -349,18 +349,25 @@ infinite-growth of the VM image count. VM is utilized. It records the usage details, along with a timestamp into the GCE VM image "labels" (metadata). Failure to update metadata is considered critical, and the task will fail to prompt - immediate corrective action by automation maintainers. + immediate corrective action by automation maintainers. When this + container detects it's running on behalf of a release-branch, it + will make a best-effort attempt to flag all VM images for permanent + retention. * `imgobsolete` is triggered periodically by cirrus-cron *only* on this repository. It scans through all GCE VM Images, filtering any which haven't been used within the last 30 days (according to `imgts` - updated labels). 
Identified images are deprecated by marking them - `obsolete` in GCE. This status blocks them from being used, but - does not actually remove them. + updated labels). Excluding any images which are marked for permanent + retention, disused images are deprecated by marking them as `obsolete` + in GCE. This will cause an error in any CI run which references them. + The images will still be recoverable manually, using the `gcloud` + utility. * `imgprune` also runs periodically, immediately following `imgobsolete`. It scans all currently obsolete GCE images, filtering any which were deprecated more than 30 days ago (according to deprecation metadata). + It will fail with a loud error message should it encounter an image marked + obsolete **and** labeled for permanent retention. Otherwise, Images which have been obsolete for more than 30 days, are permanently removed. diff --git a/imgobsolete/entrypoint.sh b/imgobsolete/entrypoint.sh index 79316c42..4cda517f 100755 --- a/imgobsolete/entrypoint.sh +++ b/imgobsolete/entrypoint.sh @@ -39,10 +39,20 @@ $GCLOUD compute images list --format="$FORMAT" --filter="$FILTER" | \ count_image reason="" created_ymd=$(date --date=$creationTimestamp --iso-8601=date) + permanent=$(egrep --only-matching --max-count=1 --ignore-case 'permanent=true' <<< $labels || true) last_used=$(egrep --only-matching --max-count=1 'last-used=[[:digit:]]+' <<< $labels || true) LABELSFX="labels: '$labels'" + # Any image marked with a `permanent=true` label should be retained forever. + # Typically this will be due to its use by CI in a release-branch. The image's + # `repo-ref` and `build-id` labels should provide clues as to where it's + # required (may be multiple repos.) - for any future auditing purposes. 
+ if [[ -n "$permanent" ]]; then + msg "Retaining forever $name | $labels" + continue + fi + # No label was set if [[ -z "$last_used" ]] then # image lacks any tracking labels diff --git a/imgprune/entrypoint.sh b/imgprune/entrypoint.sh index cd4862a2..8d6e8e2f 100755 --- a/imgprune/entrypoint.sh +++ b/imgprune/entrypoint.sh @@ -39,8 +39,11 @@ $GCLOUD compute images list --show-deprecated \ do count_image reason="" + permanent=$(egrep --only-matching --max-count=1 --ignore-case 'permanent=true' <<< $labels || true) + [[ -z "$permanent" ]] || \ + die 1 "Refusing to delete a deprecated image labeled permanent=true. Please use gcloud utility to set image active, then research the cause of deprecation." [[ "$dep_state" == "OBSOLETE" ]] || \ - die 1 "Error: Unexpected depreciation-state encountered for $name: $dep_state; labels: $labels" + die 1 "Unexpected depreciation-state encountered for $name: $dep_state; labels: $labels" reason="Obsolete as of $del_date; labels: $labels" echo "$name $reason" >> $TODELETE done diff --git a/imgts/Containerfile b/imgts/Containerfile index 6ac521c3..0f24fc86 100644 --- a/imgts/Containerfile +++ b/imgts/Containerfile @@ -3,7 +3,7 @@ FROM quay.io/centos/centos:stream8 # Only needed for installing build-time dependencies COPY /imgts/google-cloud-sdk.repo /etc/yum.repos.d/google-cloud-sdk.repo RUN dnf -y --setopt=keepcache=true update && \ - dnf -y --setopt=keepcache=true install epel-release python3 && \ + dnf -y --setopt=keepcache=true install epel-release python3 jq && \ dnf -y --setopt=keepcache=true --exclude=google-cloud-sdk-366.0.0-1 \ install google-cloud-sdk diff --git a/imgts/entrypoint.sh b/imgts/entrypoint.sh index 21e322ff..fc14cf05 100755 --- a/imgts/entrypoint.sh +++ b/imgts/entrypoint.sh @@ -14,14 +14,17 @@ req_env_var GCPJSON GCPNAME GCPPROJECT IMGNAMES BUILDID REPOREF gcloud_init +# Set this to 1 for testing +DRY_RUN="${DRY_RUN:-0}" + # These must be defined by the cirrus-ci job using the container # shellcheck 
disable=SC2154 -ARGS=" - --update-labels=last-used=$(date +%s) - --update-labels=build-id=$BUILDID - --update-labels=repo-ref=$REPOREF - --update-labels=project=$GCPPROJECT -" +ARGS=(\ + "--update-labels=last-used=$(date +%s)" + "--update-labels=build-id=$BUILDID" + "--update-labels=repo-ref=$REPOREF" + "--update-labels=project=$GCPPROJECT" +) # Must be defined by the cirrus-ci job using the container # shellcheck disable=SC2154 @@ -37,11 +40,77 @@ ERRIMGS='' # It's possible for multiple simultaneous label updates to clash CLASHMSG='Labels fingerprint either invalid or resource labels have changed' +# This function accepts a single argument: A Cirrus-CI build ID. The +# function looks up the build from Cirrus-CI to determine if it occurred +# on a release branch (named like `v<digits>...` or `release-<digits>...`). If so the function returns zero. Otherwise, it +# returns 1 for executions on behalf of the `main` branch, all PRs and +# all tags. It will fully exit non-zero in case of any error. +is_release_branch_image(){ + local buildId api query result prefix branch tag + buildId=$1 + api="https://api.cirrus-ci.com/graphql" + query="{ + \"query\": \"query { + build(id: $buildId) { + branch + tag + pullRequest + } + }\" + }" + + # This is mandatory, must never be unset, empty, or shorter than an actual ID. + # Normally about 16-characters long. + if ((${#buildId}<14)); then + die 1 "Empty/invalid BuildId '$buildId' passed to is_release_branch_image()" + fi + + prefix=".data.build" + result=$(curl --silent --location \ + --request POST --data @- --url "$api" <<<"$query") \ + || \ + die 2 "Error communicating with GraphQL API $api: $result" + + # Any problem with the GraphQL reply or mismatch of the JSON + # structure (specified in query) is an error that operators should + # be made aware of. + if ! 
jq -e "$prefix" <<<"$result" &> /dev/null; then + die 3 "Response from Cirrus API query '$query' has unexpected/invalid JSON structure: +$result" + fi + + # Cirrus-CI always sets some branch value for all execution contexts + if ! branch=$(jq -e --raw-output "${prefix}.branch" <<<"$result"); then + die 4 "Empty/null branch value returned for build '$buildId': +$result" + fi + + # This value will be empty/null for PRs and branch builds + tag=$(jq --raw-output "${prefix}.tag" <<<"$result" | sed 's/null//g') + + # Cirrus-CI sets `branch=pull/#` for pull-requests, dependabot creates its own `dependabot/...` branches; neither counts as a release branch. + if [[ -z "$tag" && "$branch" =~ ^(v|release-)[0-9]+.* ]]; then + msg "Found build $buildId for release branch '$branch'." + return 0 + fi + + msg "Found build '$buildId' for non-release branch '$branch' and/or tag '$tag' (may be empty)." + return 1 +} + +unset SET_PERM +if is_release_branch_image $BUILDID; then + ARGS+=("--update-labels=permanent=true") + SET_PERM=1 +fi + +if ((DRY_RUN)); then GCLOUD='echo'; fi + # Must be defined by the cirrus-ci job using the container # shellcheck disable=SC2154 for image in $IMGNAMES do - if ! OUTPUT=$($GCLOUD compute images update "$image" $ARGS 2>&1); then + if ! OUTPUT=$($GCLOUD compute images update "$image" "${ARGS[@]}" 2>&1); then echo "$OUTPUT" > /dev/stderr if grep -iq "$CLASHMSG" <<<"$OUTPUT"; then # Updating the 'last-used' label is most important. @@ -52,7 +121,12 @@ do msg "Detected update error for '$image'" > /dev/stderr ERRIMGS="$ERRIMGS $image" else - echo "$OUTPUT" > /dev/stderr + # Report which label update was applied to the image, for reference + if ((SET_PERM)); then + msg "IMAGE $image MARKED FOR PERMANENT RETENTION" + else + echo "Updated image $image last-used timestamp" + fi fi done