diff --git a/README.md b/README.md index a65d2073..5ab4f76c 100644 --- a/README.md +++ b/README.md @@ -349,18 +349,25 @@ infinite-growth of the VM image count. VM is utilized. It records the usage details, along with a timestamp into the GCE VM image "labels" (metadata). Failure to update metadata is considered critical, and the task will fail to prompt - immediate corrective action by automation maintainers. + immediate corrective action by automation maintainers. When this + container detects it's running on behalf of a release-branch, it + will make a best-effort attempt to flag all VM images for permanent + retention. * `imgobsolete` is triggered periodically by cirrus-cron *only* on this repository. It scans through all GCE VM Images, filtering any which haven't been used within the last 30 days (according to `imgts` - updated labels). Identified images are deprecated by marking them - `obsolete` in GCE. This status blocks them from being used, but - does not actually remove them. + updated labels). Excluding any images which are marked for permanent + retention, disused images are deprecated by marking them as `obsolete` + in GCE. This will cause an error in any CI run which references them. + The images will still be recoverable manually, using the `gcloud` + utility. * `imgprune` also runs periodically, immediately following `imgobsolete`. It scans all currently obsolete GCE images, filtering any which were deprecated more than 30 days ago (according to deprecation metadata). + It will fail with a loud error message should it encounter an image marked + obsolete **and** labeled for permanent retention. Otherwise, images which have been obsolete for more than 30 days, are permanently removed. 
diff --git a/imgobsolete/entrypoint.sh b/imgobsolete/entrypoint.sh index 79316c42..4cda517f 100755 --- a/imgobsolete/entrypoint.sh +++ b/imgobsolete/entrypoint.sh @@ -39,10 +39,20 @@ $GCLOUD compute images list --format="$FORMAT" --filter="$FILTER" | \ count_image reason="" created_ymd=$(date --date=$creationTimestamp --iso-8601=date) + permanent=$(egrep --only-matching --max-count=1 --ignore-case 'permanent=true' <<< $labels || true) last_used=$(egrep --only-matching --max-count=1 'last-used=[[:digit:]]+' <<< $labels || true) LABELSFX="labels: '$labels'" + # Any image marked with a `permanent=true` label should be retained forever. + # Typically this will be due to its use by CI in a release-branch. The image's + # `repo-ref` and `build-id` labels should provide clues as to where it's + # required (may be multiple repos.) - for any future auditing purposes. + if [[ -n "$permanent" ]]; then + msg "Retaining forever $name | $labels" + continue + fi + # No label was set if [[ -z "$last_used" ]] then # image lacks any tracking labels diff --git a/imgprune/entrypoint.sh b/imgprune/entrypoint.sh index cd4862a2..8d6e8e2f 100755 --- a/imgprune/entrypoint.sh +++ b/imgprune/entrypoint.sh @@ -39,8 +39,11 @@ $GCLOUD compute images list --show-deprecated \ do count_image reason="" + permanent=$(egrep --only-matching --max-count=1 --ignore-case 'permanent=true' <<< $labels || true) + [[ -z "$permanent" ]] || \ + die 1 "Refusing to delete deprecated image $name labeled permanent=true (labels: $labels). Please use gcloud utility to set image active, then research the cause of deprecation." 
[[ "$dep_state" == "OBSOLETE" ]] || \ - die 1 "Error: Unexpected depreciation-state encountered for $name: $dep_state; labels: $labels" + die 1 "Unexpected deprecation-state encountered for $name: $dep_state; labels: $labels" reason="Obsolete as of $del_date; labels: $labels" echo "$name $reason" >> $TODELETE done diff --git a/imgts/Containerfile b/imgts/Containerfile index 6ac521c3..0f24fc86 100644 --- a/imgts/Containerfile +++ b/imgts/Containerfile @@ -3,7 +3,7 @@ FROM quay.io/centos/centos:stream8 # Only needed for installing build-time dependencies COPY /imgts/google-cloud-sdk.repo /etc/yum.repos.d/google-cloud-sdk.repo RUN dnf -y --setopt=keepcache=true update && \ - dnf -y --setopt=keepcache=true install epel-release python3 && \ + dnf -y --setopt=keepcache=true install epel-release python3 jq && \ dnf -y --setopt=keepcache=true --exclude=google-cloud-sdk-366.0.0-1 \ install google-cloud-sdk diff --git a/imgts/entrypoint.sh b/imgts/entrypoint.sh index 21e322ff..fc14cf05 100755 --- a/imgts/entrypoint.sh +++ b/imgts/entrypoint.sh @@ -14,14 +14,17 @@ req_env_var GCPJSON GCPNAME GCPPROJECT IMGNAMES BUILDID REPOREF gcloud_init +# Set this to 1 for testing (gcloud update commands are echoed, not executed) +DRY_RUN="${DRY_RUN:-0}" + # These must be defined by the cirrus-ci job using the container # shellcheck disable=SC2154 -ARGS=" - --update-labels=last-used=$(date +%s) - --update-labels=build-id=$BUILDID - --update-labels=repo-ref=$REPOREF - --update-labels=project=$GCPPROJECT -" +ARGS=(\ + "--update-labels=last-used=$(date +%s)" + "--update-labels=build-id=$BUILDID" + "--update-labels=repo-ref=$REPOREF" + "--update-labels=project=$GCPPROJECT" +) # Must be defined by the cirrus-ci job using the container # shellcheck disable=SC2154 @@ -37,11 +40,77 @@ ERRIMGS='' # It's possible for multiple simultaneous label updates to clash CLASHMSG='Labels fingerprint either invalid or resource labels have changed' +# This function accepts a single argument: A Cirrus-CI build ID. 
The +# function looks up the build from Cirrus-CI to determine if it occurred +# on a release branch (untagged; branch name matches `^(v|release-)[0-9]+`). +# If so the function returns zero. Otherwise (e.g. `main`, PRs, tags) it +# returns 1. It will fully exit non-zero in case of any error. +is_release_branch_image(){ + local buildId api query result prefix branch tag + buildId=$1 + api="https://api.cirrus-ci.com/graphql" + query="{ + \"query\": \"query { + build(id: $buildId) { + branch + tag + pullRequest + } + }\" + }" + + # This is mandatory, must never be unset, empty, or shorter than an actual ID. + # Normally about 16-characters long. + if ((${#buildId}<14)); then + die 1 "Empty/invalid BuildId '$buildId' passed to is_release_branch_image()" + fi + + prefix=".data.build" + result=$(curl --silent --location \ + --request POST --data @- --url "$api" <<<"$query") \ + || \ + die 2 "Error communicating with GraphQL API $api: $result" + + # Any problems with the GraphQL reply or mismatch of the JSON + # structure (specified in query) is an error that operators should + # be made aware of. + if ! jq -e "$prefix" <<<"$result" &> /dev/null; then + die 3 "Response from Cirrus API query '$query' has unexpected/invalid JSON structure: +$result" + fi + + # Cirrus-CI always sets some branch value for all execution contexts + if ! branch=$(jq -e --raw-output "${prefix}.branch" <<<"$result"); then + die 4 "Empty/null branch value returned for build '$buildId': +$result" + fi + + # This value will be empty/null for PRs and branch builds + tag=$(jq --raw-output "${prefix}.tag" <<<"$result" | sed 's/null//g') + + # Cirrus-CI sets `branch=pull/#` for pull-requests, which cannot match the release-branch pattern below. + if [[ -z "$tag" && "$branch" =~ ^(v|release-)[0-9]+.* ]]; then + msg "Found build $buildId for release branch '$branch'." + return 0 + fi + + msg "Found build '$buildId' for non-release branch '$branch' and/or tag '$tag' (may be empty)." 
+ return 1 +} + +unset SET_PERM +if is_release_branch_image $BUILDID; then + ARGS+=("--update-labels=permanent=true") + SET_PERM=1 +fi + +if ((DRY_RUN)); then GCLOUD='echo'; fi + # Must be defined by the cirrus-ci job using the container # shellcheck disable=SC2154 for image in $IMGNAMES do - if ! OUTPUT=$($GCLOUD compute images update "$image" $ARGS 2>&1); then + if ! OUTPUT=$($GCLOUD compute images update "$image" "${ARGS[@]}" 2>&1); then echo "$OUTPUT" > /dev/stderr if grep -iq "$CLASHMSG" <<<"$OUTPUT"; then # Updating the 'last-used' label is most important. @@ -52,7 +121,12 @@ do msg "Detected update error for '$image'" > /dev/stderr ERRIMGS="$ERRIMGS $image" else - echo "$OUTPUT" > /dev/stderr + # Report which image was updated, and whether it was flagged for permanent retention + if ((SET_PERM)); then + msg "IMAGE $image MARKED FOR PERMANENT RETENTION" + else + echo "Updated image $image last-used timestamp" + fi fi done