From 980d5b3622a7ac5cb7cb0cc6c76ec147affc4361 Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Thu, 20 Oct 2022 10:16:23 -0400 Subject: [PATCH 1/3] GHA: Simplify script reference This workflow was originally crafted to be (somehow) reused with different scripts. That never happened and the extra indirection is confusing and hard to maintain. Remove it. Signed-off-by: Chris Evich --- .github/workflows/check_cirrus_cron.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_cirrus_cron.yml b/.github/workflows/check_cirrus_cron.yml index a6d00227a3..51e5c40aab 100644 --- a/.github/workflows/check_cirrus_cron.yml +++ b/.github/workflows/check_cirrus_cron.yml @@ -38,7 +38,7 @@ jobs: - name: Get failed cron names and Build IDs id: cron - run: './.github/actions/${{ github.workflow }}/${{ github.job }}.sh' + run: './.github/actions/check_cirrus_cron/cron_failures.sh' - if: steps.cron.outputs.failures > 0 shell: bash From 3a85d537b6073322b1a78be46c2bcc59d2c199c1 Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Thu, 20 Oct 2022 10:37:21 -0400 Subject: [PATCH 2/3] GHA: Migrate inline script to file Inline scripts make github-action workflow YAML harder to read/maintain. Relocate the e-mail formation script to a dedicated file. This also permits better input-validation and re-use of a common `err()` function. 
Signed-off-by: Chris Evich --- .../check_cirrus_cron/cron_failures.sh | 8 ++--- .github/actions/check_cirrus_cron/lib.sh | 7 ++++ .../check_cirrus_cron/make_email_body.sh | 35 +++++++++++++++++++ .github/workflows/check_cirrus_cron.yml | 19 +--------- 4 files changed, 45 insertions(+), 24 deletions(-) create mode 100644 .github/actions/check_cirrus_cron/lib.sh create mode 100755 .github/actions/check_cirrus_cron/make_email_body.sh diff --git a/.github/actions/check_cirrus_cron/cron_failures.sh b/.github/actions/check_cirrus_cron/cron_failures.sh index f4dddff8bc..1efe57c145 100755 --- a/.github/actions/check_cirrus_cron/cron_failures.sh +++ b/.github/actions/check_cirrus_cron/cron_failures.sh @@ -5,11 +5,7 @@ set -eo pipefail # Intended to be executed from a github action workflow step. # Outputs the Cirrus cron names and IDs of any failed builds -err() { - # Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions - echo "::error file=${BASH_SOURCE[0]},line=${BASH_LINENO[0]}::${1:-No error message given}" - exit 1 -} +source $(dirname "${BASH_SOURCE[0]}")/lib.sh _errfmt="Expecting %s value to not be empty" if [[ -z "$GITHUB_REPOSITORY" ]]; then @@ -118,5 +114,5 @@ cat "$NAME_ID_FILEPATH" records=$(wc --words "$NAME_ID_FILEPATH" | cut -d ' ' -f 1) # Always two words per record failures=$((records/2)) -echo "::set-output name=failures::$failures" +echo "failures::$failures" >> $GITHUB_OUTPUT echo "Total failed Cirrus-CI cron builds: $failures" diff --git a/.github/actions/check_cirrus_cron/lib.sh b/.github/actions/check_cirrus_cron/lib.sh new file mode 100644 index 0000000000..1838798dd1 --- /dev/null +++ b/.github/actions/check_cirrus_cron/lib.sh @@ -0,0 +1,7 @@ + +# Must be called from top-level of script, not another function. 
+err() { + # Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions + echo "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[1]}::${1:-No error message given}" + exit 1 +} diff --git a/.github/actions/check_cirrus_cron/make_email_body.sh b/.github/actions/check_cirrus_cron/make_email_body.sh new file mode 100755 index 0000000000..f88803da9f --- /dev/null +++ b/.github/actions/check_cirrus_cron/make_email_body.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -eo pipefail + +# Intended to be executed from a github action workflow step. +# Input: File listing space separated failed cron build names and IDs +# Output: $GITHUB_WORKSPACE/artifacts/email_body.txt file + +source $(dirname "${BASH_SOURCE[0]}")/lib.sh + +_errfmt="Expecting %s value to not be empty" +if [[ -z "$GITHUB_REPOSITORY" ]]; then + err $(printf "$_errfmt" "\$GITHUB_REPOSITORY") +elif [[ -z "$GITHUB_WORKFLOW" ]]; then + err $(printf "$_errfmt" "\$GITHUB_WORKFLOW") +elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then + _errfmt="Expecting %s value to be a readable file" + err $(printf "$_errfmt" "\$NAME_ID_FILEPATH") +fi + +mkdir -p artifacts +( + echo "Detected one or more Cirrus-CI cron-triggered jobs have failed recently:" + echo "" + + while read -r NAME BID; do + echo "Cron build '$NAME' Failed: https://cirrus-ci.com/build/$BID" + done < "$NAME_ID_FILEPATH" + + echo "" + echo "# Source: ${GITHUB_WORKFLOW} workflow on ${GITHUB_REPOSITORY}." + # Separate content from sendgrid.com automatic footer. + echo "" + echo "" +) > ./artifacts/email_body.txt diff --git a/.github/workflows/check_cirrus_cron.yml b/.github/workflows/check_cirrus_cron.yml index 51e5c40aab..1162ba2e76 100644 --- a/.github/workflows/check_cirrus_cron.yml +++ b/.github/workflows/check_cirrus_cron.yml @@ -42,24 +42,7 @@ jobs: - if: steps.cron.outputs.failures > 0 shell: bash - # Must be inline, since context expressions are used. 
- # Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/context-and-expression-syntax-for-github-actions - run: | - set -eo pipefail - ( - echo "Detected one or more Cirrus-CI cron-triggered jobs have failed recently:" - echo "" - - while read -r NAME BID; do - echo "Cron build '$NAME' Failed: https://cirrus-ci.com/build/$BID" - done < "$NAME_ID_FILEPATH" - - echo "" - echo "# Source: ${{ github.workflow }} workflow on ${{ github.repository }}." - # Separate content from sendgrid.com automatic footer. - echo "" - echo "" - ) > ./artifacts/email_body.txt + run: './.github/actions/check_cirrus_cron/make_email_body.sh' - if: steps.cron.outputs.failures > 0 name: Send failure notification e-mail From 35523d560a97152cad3e83cebfe86c49512b87ac Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Thu, 20 Oct 2022 13:21:04 -0400 Subject: [PATCH 3/3] GHA: Auto. re-run failed cirrus-cron builds once With a seemingly ever growing list of cirrus-cron jobs running on release branches, there are bound to be some hiccups. Sometimes a lot of them. Normally any failures require a human to eyeball the logs and/or manually re-run the job to see if it was simply a flake. This doesn't take long, but can be distracting and compounds over time. Attempt to alleviate some maintainer burden by using a new github action workflow to perform **one** automatic re-run on any failed builds. This task is scheduled an hour prior to a second failure check, and generation of notification e-mail for review. Note: If there are no failures, due to the auto. re-run or luck, no e-mail is generated. If this proves useful in this repo, I intend to re-use this workflow for other repo's cirrus-cron jobs.
Signed-off-by: Chris Evich --- .../check_cirrus_cron/cron_failures.sh | 63 +++------- .github/actions/check_cirrus_cron/lib.sh | 70 ++++++++++- .../check_cirrus_cron/make_email_body.sh | 3 +- .../check_cirrus_cron/rerun_failed_tasks.sh | 112 ++++++++++++++++++ .github/workflows/rerun_cirrus_cron.yml | 61 ++++++++++ 5 files changed, 261 insertions(+), 48 deletions(-) create mode 100755 .github/actions/check_cirrus_cron/rerun_failed_tasks.sh create mode 100644 .github/workflows/rerun_cirrus_cron.yml diff --git a/.github/actions/check_cirrus_cron/cron_failures.sh b/.github/actions/check_cirrus_cron/cron_failures.sh index 1efe57c145..0e669c19c1 100755 --- a/.github/actions/check_cirrus_cron/cron_failures.sh +++ b/.github/actions/check_cirrus_cron/cron_failures.sh @@ -8,31 +8,25 @@ set -eo pipefail source $(dirname "${BASH_SOURCE[0]}")/lib.sh _errfmt="Expecting %s value to not be empty" -if [[ -z "$GITHUB_REPOSITORY" ]]; then +if [[ -z "$GITHUB_REPOSITORY" ]]; then # / err $(printf "$_errfmt" "\$GITHUB_REPOSITORY") -elif [[ -z "$NAME_ID_FILEPATH" ]]; then +elif [[ -z "$NAME_ID_FILEPATH" ]]; then # output filepath err $(printf "$_errfmt" "\$NAME_ID_FILEPATH") fi mkdir -p artifacts cat > ./artifacts/query_raw.json << "EOF" -{"query":" - query CronNameStatus($owner: String!, $repo: String!) 
{ - ownerRepository(platform: \"LINUX\", owner: $owner, name: $repo) { - cronSettings { - name - lastInvocationBuild { - id - status - } +query { + ownerRepository(platform: "LINUX", owner: "@@OWNER@@", name: "@@REPO@@") { + cronSettings { + name + lastInvocationBuild { + id + status } } } -", -"variables":"{ - \"owner\": \"@@OWNER@@\", - \"repo\": \"@@REPO@@\" -}"} +} EOF # Makes for easier copy/pasting query to/from # https://cirrus-ci.com/explorer @@ -40,7 +34,6 @@ owner=$(cut -d '/' -f 1 <<<"$GITHUB_REPOSITORY") repo=$(cut -d '/' -f 2 <<<"$GITHUB_REPOSITORY") sed -i -r -e "s/@@OWNER@@/$owner/g" -e "s/@@REPO@@/$repo/g" ./artifacts/query_raw.json -echo "::group::Posting GraphQL Query" # Easier to debug in error-reply when query is compacted -tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' | tee ./artifacts/query.json | \ - jq --indent 4 --color-output . +# The query is GraphQL, not JSON, so jq cannot parse it; gql() logs the query itself +tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' > ./artifacts/query.json @@ -48,21 +41,13 @@ tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' | tee ./artifacts/query.json if grep -q '@@' ./artifacts/query.json; then err "Found unreplaced substitution token in raw query JSON" fi -curl \ - --request POST \ - --silent \ - --location \ - --header 'content-type: application/json' \ - --url 'https://api.cirrus-ci.com/graphql' \ - --data @./artifacts/query.json \ - --output ./artifacts/reply.json -echo "::endgroup::" -echo "::group::Received GraphQL Reply" -jq --indent 4 --color-output . <./artifacts/reply.json || \ - cat ./artifacts/reply.json -echo "::endgroup::" +# The query should never ever return an empty-list, unless there are no cirrus-cron +# jobs defined for the repository. In that case, this monitoring script shouldn't +# be running anyway. +filt_head='.data.ownerRepository.cronSettings' +gql "$(<./artifacts/query.json)" "$filt_head" > ./artifacts/reply.json # e.x.
reply.json # { # "data": { @@ -87,22 +72,8 @@ echo "::endgroup::" # "lastInvocationBuild": { # "id": "5003065549914112", # "status": "FAILED" -# } # } -# ] -# } -# } -# } - -# This should never ever return an empty-list, unless there are no cirrus-cron -# jobs defined for the repository. In that case, this monitoring script shouldn't -# be running anyway. -filt_head='.data.ownerRepository.cronSettings' -if ! jq -e "$filt_head" ./artifacts/reply.json &> /dev/null -then - # Actual colorized JSON reply was printed above - err "Null/empty result filtering reply with '$filt_head'" -fi +# ... filt="$filt_head | map(select(.lastInvocationBuild.status==\"FAILED\") | { name:.name, id:.lastInvocationBuild.id} | join(\" \")) | join(\"\n\")" jq --raw-output "$filt" ./artifacts/reply.json > "$NAME_ID_FILEPATH" @@ -114,5 +85,7 @@ cat "$NAME_ID_FILEPATH" records=$(wc --words "$NAME_ID_FILEPATH" | cut -d ' ' -f 1) # Always two words per record failures=$((records/2)) +# Set the output of this step. +# Ref: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter -echo "failures::$failures" >> $GITHUB_OUTPUT +echo "failures=$failures" >> "$GITHUB_OUTPUT" echo "Total failed Cirrus-CI cron builds: $failures" diff --git a/.github/actions/check_cirrus_cron/lib.sh b/.github/actions/check_cirrus_cron/lib.sh index 1838798dd1..70f08f8099 100644 --- a/.github/actions/check_cirrus_cron/lib.sh +++ b/.github/actions/check_cirrus_cron/lib.sh @@ -1,7 +1,75 @@ + +# Send text to stderr +msg() { + echo "$@" > /dev/stderr +} + # Must be called from top-level of script, not another function.
err() { # Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions - echo "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[1]}::${1:-No error message given}" + msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::$@" exit 1 } + +# Using python3 here is a compromise for readability and +# properly handling quote, control and unicode character encoding. +escape_query() { + local json_string + # Assume it's okay to squash repeated whitespaces inside the query + json_string=$(printf '%s' "$1" | \ + tr --delete '\r\n' | \ + tr --squeeze-repeats '[:space:]' | \ + python3 -c 'import sys,json; print(json.dumps(sys.stdin.read()))') + # The $json_string in message is already quoted + echo -n "$json_string" +} + +# Given a GraphQL query/mutation, fire it at the API, +# and return the output on stdout. The optional +# second parameter may contain a jq filter-string. +# When provided, if the GQL result is empty, null, +# fails to parse, or does not match the filter-string, +# non-zero will be returned. +gql() { + local e_query query + e_query=$(escape_query "$1") + query="{\"query\": $e_query}" + local filter + filter="$2" + local output + local filtered + msg "::group::Posting GraphQL Query and checking result" + msg "query: " + if ! jq -e . <<<"$query" > /dev/stderr; then + msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Invalid query JSON: $query" + return 1 + fi + if output=$(curl \ + --request POST \ + --silent \ + --show-error \ + --location \ + --header 'content-type: application/json' \ + --header "Authorization: Bearer $SECRET_CIRRUS_API_KEY" \ + --url 'https://api.cirrus-ci.com/graphql' \ + --data "$query") && [[ -n "$output" ]]; then + + if filtered=$(jq -e "$filter" <<<"$output") && [[ -n "$filtered" ]]; then + msg "result:" + # Make debugging easier w/ formatted output + # to stderr for display, stdout for consumption by caller + jq --indent 2 .
<<<"$output" | tee /dev/stderr + msg "::endgroup::" + return 0 + fi + + msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query result did not pass filter '$2': '$output'" + msg "::endgroup::" + return 2 + fi + + msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query failed or result empty: '$output'" + msg "::endgroup::" + return 3 +} diff --git a/.github/actions/check_cirrus_cron/make_email_body.sh b/.github/actions/check_cirrus_cron/make_email_body.sh index f88803da9f..ab5a717eb0 100755 --- a/.github/actions/check_cirrus_cron/make_email_body.sh +++ b/.github/actions/check_cirrus_cron/make_email_body.sh @@ -14,8 +14,7 @@ if [[ -z "$GITHUB_REPOSITORY" ]]; then elif [[ -z "$GITHUB_WORKFLOW" ]]; then err $(printf "$_errfmt" "\$GITHUB_WORKFLOW") elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then - _errfmt="Expecting %s value to be a readable file" - err $(printf "$_errfmt" "\$NAME_ID_FILEPATH") + err "Expecting \$NAME_ID_FILEPATH value ($NAME_ID_FILEPATH) to be a readable file" fi mkdir -p artifacts diff --git a/.github/actions/check_cirrus_cron/rerun_failed_tasks.sh b/.github/actions/check_cirrus_cron/rerun_failed_tasks.sh new file mode 100755 index 0000000000..8432815d03 --- /dev/null +++ b/.github/actions/check_cirrus_cron/rerun_failed_tasks.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +set -eo pipefail + +# Intended to be executed from a github action workflow step. +# Input: File listing space separated failed cron build names and IDs +# Output: $GITHUB_WORKSPACE/artifacts/email_body.txt file +# +# HOW TO TEST: This script may be manually tested assuming you have +# access to the github containers-org. Cirrus API key. With that in-hand, +# this script may be manually run by: +# 1. export SECRET_CIRRUS_API_KEY= +# 2. Find an old podman build that failed on `main` or another **branch**. +# For example, from https://cirrus-ci.com/github/containers/podman/main +# (pick an old one from the bottom, since re-running it won't affect anybody) +# 3. Create a temp. 
file, like /tmp/fail with a single line, of the form: +# <cron name> <build ID> +# 4. export NAME_ID_FILEPATH=/tmp/fail +# 5. execute this script, and refresh the build in the WebUI, all unsuccessful +# tasks should change status to running or scheduled. Note: some later +# tasks may remain red as they wait for dependencies to run and pass. +# 6. After each run, cleanup with 'rm -rf ./artifacts' +# (unless you want to examine them) + +source $(dirname "${BASH_SOURCE[0]}")/lib.sh + +_errfmt="Expecting %s value to not be empty" +if [[ -z "$SECRET_CIRRUS_API_KEY" ]]; then + err $(printf "$_errfmt" "\$SECRET_CIRRUS_API_KEY") +elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then # output from cron_failures.sh + err $(printf "Expecting %s value to be a readable file" "\$NAME_ID_FILEPATH") +fi + +mkdir -p artifacts + +cat "$NAME_ID_FILEPATH" | \ + while read -r NAME BID; do + # Empty the list for every build: no stale IDs from a previous build, no failure reading a missing file + truncate -s 0 ./artifacts/rerun_tids.txt + if [[ -z "$NAME" ]]; then + err $(printf "$_errfmt" "\$NAME") + elif [[ -z "$BID" ]]; then + err $(printf "$_errfmt" "\$BID") + fi + + id_status_q=" + query { + build(id: \"$BID\") { + tasks { + id, + status + } + } + } + " + task_id_status=$(gql "$id_status_q" '.data.build.tasks[0]') + # Expected query result like: + # { + # "data": { + # "build": { + # "tasks": [ + # { + # "id": "6321184690667520", + # "status": "COMPLETED" + # }, + # ...
+ msg "::group::Selecting failed/aborted tasks to re-run" + jq -r -e '.data.build.tasks[] | join(" ")' <<<"$task_id_status" | \ + while read -r TID STATUS; do + if [[ -z "$TID" ]] || [[ -z "$STATUS" ]]; then + # assume empty line and/or end of file + msg "Skipping TID '$TID' with status '$STATUS'" + continue + # Failed task dependencies will have 'aborted' status + elif [[ "$STATUS" == "FAILED" ]] || [[ "$STATUS" == "ABORTED" ]]; then + msg "Rerunning build $BID task $TID" + # Must send result through a file into rerun_tasks array + # because this section is executing in a child-shell + echo "$TID" >> ./artifacts/rerun_tids.txt + fi + done + declare -a rerun_tasks + mapfile rerun_tasks <./artifacts/rerun_tids.txt + msg "::endgroup::" + + if [[ "${#rerun_tasks[*]}" -eq 0 ]]; then + msg "No tasks to re-run for build $BID" + continue; + fi + + msg "::warning::Rerunning ${#rerun_tasks[*]} tasks for build $BID" + # Check-value returned if the gql call was successful + canary=$(uuidgen) + # Ensure the trailing ',' is stripped from the end (would be invalid JSON) + task_ids=$(printf '[%s]' $(printf '"%s",' ${rerun_tasks[@]} | head -c -1)) + rerun_m=" + mutation { + batchReRun(input: { + clientMutationId: \"$canary\", + taskIds: $task_ids + } + ) { + clientMutationId + } + } + " + filter='.data.batchReRun.clientMutationId' + result=$(gql "$rerun_m" "$filter") + if [[ $(jq -r -e "$filter"<<<"$result") != "$canary" ]]; then + err "Attempt to re-run tasks for build $BID failed: ${rerun_tasks[@]}" + fi + done diff --git a/.github/workflows/rerun_cirrus_cron.yml b/.github/workflows/rerun_cirrus_cron.yml new file mode 100644 index 0000000000..7fd01b071d --- /dev/null +++ b/.github/workflows/rerun_cirrus_cron.yml @@ -0,0 +1,65 @@ +--- + +# Format Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions + +# Required to un-FUBAR default ${{github.workflow}} value +name: rerun_cirrus_cron + +on: + # Note: This only applies to the
main branch. + schedule: + # N/B: This should fire about an hour prior to check_cirrus_cron + # so the re-runs have a chance to complete. + - cron: '59 22 * * 1-5' + # Debug: Allow triggering job manually in github-actions WebUI + workflow_dispatch: {} + +env: + # Debug-mode can reveal secrets, only enable by a secret value. + # Ref: https://help.github.com/en/actions/configuring-and-managing-workflows/managing-a-workflow-run#enabling-step-debug-logging + ACTIONS_STEP_DEBUG: '${{ secrets.ACTIONS_STEP_DEBUG }}' + # CSV listing of e-mail addresses for delivery failure or error notices + RCPTCSV: rh.container.bot@gmail.com,podman-monitor@lists.podman.io + # Filename for table of cron-name to build-id data + # (must be in $GITHUB_WORKSPACE/artifacts/) + NAME_ID_FILEPATH: './artifacts/name_id.txt' + +permissions: + contents: read + +jobs: + cron_failures: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@629c2de402a417ea7690ca6ce3f33229e27606a5 # v2 + with: + persist-credentials: false + + - name: Get failed cron names and Build IDs + id: cron + run: './.github/actions/check_cirrus_cron/cron_failures.sh' + + - if: steps.cron.outputs.failures > 0 + shell: bash + # Secrets are never exported implicitly; the script aborts when this key is empty + env: + SECRET_CIRRUS_API_KEY: '${{ secrets.SECRET_CIRRUS_API_KEY }}' + run: './.github/actions/check_cirrus_cron/rerun_failed_tasks.sh' + + - uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2 + with: + name: ${{ github.job }}_artifacts + path: artifacts/* + + - if: failure() + name: Send error notification e-mail + uses: dawidd6/action-send-mail@a80d851dc950256421f1d1d735a2dc1ef314ac8f # v2.2.2 + with: + server_address: ${{secrets.ACTION_MAIL_SERVER}} + server_port: 465 + username: ${{secrets.ACTION_MAIL_USERNAME}} + password: ${{secrets.ACTION_MAIL_PASSWORD}} + subject: Github workflow error on ${{github.repository}} + to: ${{env.RCPTCSV}} + from: ${{secrets.ACTION_MAIL_SENDER}} + body: "Job failed: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}"