Skip to content

Commit

Permalink
Merge pull request #16357 from cevich/cron_auto_rerun
Browse files Browse the repository at this point in the history
[CI:DOCS] [WIP] GHA: Auto. re-run failed cirrus-cron builds once
  • Loading branch information
openshift-merge-robot authored Nov 3, 2022
2 parents 6428ff1 + 35523d5 commit eb7da14
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 70 deletions.
71 changes: 20 additions & 51 deletions .github/actions/check_cirrus_cron/cron_failures.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,68 +5,49 @@ set -eo pipefail
# Intended to be executed from a github action workflow step.
# Outputs the Cirrus cron names and IDs of any failed builds

err() {
# Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions
echo "::error file=${BASH_SOURCE[0]},line=${BASH_LINENO[0]}::${1:-No error message given}"
exit 1
}
source $(dirname "${BASH_SOURCE[0]}")/lib.sh

_errfmt="Expecting %s value to not be empty"
if [[ -z "$GITHUB_REPOSITORY" ]]; then
if [[ -z "$GITHUB_REPOSITORY" ]]; then # <owner>/<repo>
err $(printf "$_errfmt" "\$GITHUB_REPOSITORY")
elif [[ -z "$NAME_ID_FILEPATH" ]]; then
elif [[ -z "$NAME_ID_FILEPATH" ]]; then # output filepath
err $(printf "$_errfmt" "\$NAME_ID_FILEPATH")
fi

mkdir -p artifacts
cat > ./artifacts/query_raw.json << "EOF"
{"query":"
query CronNameStatus($owner: String!, $repo: String!) {
ownerRepository(platform: \"LINUX\", owner: $owner, name: $repo) {
cronSettings {
name
lastInvocationBuild {
id
status
}
query {
ownerRepository(platform: "LINUX", owner: "@@OWNER@@", name: "@@REPO@@") {
cronSettings {
name
lastInvocationBuild {
id
status
}
}
}
",
"variables":"{
\"owner\": \"@@OWNER@@\",
\"repo\": \"@@REPO@@\"
}"}
}
EOF
# Makes for easier copy/pasting query to/from
# https://cirrus-ci.com/explorer
owner=$(cut -d '/' -f 1 <<<"$GITHUB_REPOSITORY")
repo=$(cut -d '/' -f 2 <<<"$GITHUB_REPOSITORY")
sed -i -r -e "s/@@OWNER@@/$owner/g" -e "s/@@REPO@@/$repo/g" ./artifacts/query_raw.json

echo "::group::Posting GraphQL Query"
# Easier to debug in error-reply when query is compacted
tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' | tee ./artifacts/query.json | \
jq --indent 4 --color-output .

if grep -q '@@' ./artifacts/query.json; then
err "Found unreplaced substitution token in raw query JSON"
fi
curl \
--request POST \
--silent \
--location \
--header 'content-type: application/json' \
--url 'https://api.cirrus-ci.com/graphql' \
--data @./artifacts/query.json \
--output ./artifacts/reply.json
echo "::endgroup::"

echo "::group::Received GraphQL Reply"
jq --indent 4 --color-output . <./artifacts/reply.json || \
cat ./artifacts/reply.json
echo "::endgroup::"
# The query should never ever return an empty-list, unless there are no cirrus-cron
# jobs defined for the repository. In that case, this monitoring script shouldn't
# be running anyway.
filt_head='.data.ownerRepository.cronSettings'

# Hand the query *text* to gql: "$(<file)" expands to the file's contents,
# whereas "$(file)" would try to execute the JSON file as a command.
gql "$(<./artifacts/query.json)" "$filt_head" > ./artifacts/reply.json
# e.g. reply.json
# {
# "data": {
Expand All @@ -91,22 +72,8 @@ echo "::endgroup::"
# "lastInvocationBuild": {
# "id": "5003065549914112",
# "status": "FAILED"
# }
# }
# ]
# }
# }
# }

# This should never ever return an empty-list, unless there are no cirrus-cron
# jobs defined for the repository. In that case, this monitoring script shouldn't
# be running anyway.
filt_head='.data.ownerRepository.cronSettings'
if ! jq -e "$filt_head" ./artifacts/reply.json &> /dev/null
then
# Actual colorized JSON reply was printed above
err "Null/empty result filtering reply with '$filt_head'"
fi
# ...

filt="$filt_head | map(select(.lastInvocationBuild.status==\"FAILED\") | { name:.name, id:.lastInvocationBuild.id} | join(\" \")) | join(\"\n\")"
jq --raw-output "$filt" ./artifacts/reply.json > "$NAME_ID_FILEPATH"
Expand All @@ -118,5 +85,7 @@ cat "$NAME_ID_FILEPATH"
records=$(wc --words "$NAME_ID_FILEPATH" | cut -d ' ' -f 1)
# Always two words per record
failures=$((records/2))
echo "::set-output name=failures::$failures"
# Set the output of this step.
# Ref: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
# The file named by $GITHUB_OUTPUT expects one `name=value` pair per line.
echo "failures=$failures" >> "$GITHUB_OUTPUT"
echo "Total failed Cirrus-CI cron builds: $failures"
75 changes: 75 additions & 0 deletions .github/actions/check_cirrus_cron/lib.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@


# Send text to stderr.
# Arguments: all arguments are joined with single spaces and printed,
#            followed by a newline.
# Writes to the already-open stderr descriptor (>&2) instead of re-opening
# /dev/stderr by path: re-opening truncates a file that stderr has been
# redirected to, and may fail when stderr is not a file or pipe.
msg() {
  local IFS=' '
  printf '%s\n' "$*" >&2
}

# Print a GitHub Actions "::error" workflow command to stderr, then exit 1.
# Must be called from top-level of script, not another function.
# Arguments: the error message; all arguments are joined with single spaces.
#            When no message is given a generic placeholder is printed.
# The annotation's file/line come from the caller's entry in bash's call
# stack (BASH_SOURCE[1] / BASH_LINENO[0]), i.e. the place err was invoked.
err() {
  local IFS=' '
  # Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions
  msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::${*:-No error message given}"
  exit 1
}

# Convert a GraphQL query/mutation into a JSON string literal.
# Arguments: $1 - the raw query text (may span multiple lines)
# Outputs:   the query as a double-quoted, JSON-escaped string on stdout,
#            without a trailing newline.
#
# Using python3 here is a compromise for readability and
# properly handling quote, control and unicode character encoding.
escape_query() {
  local json_string
  # Assume it's okay to squash repeated whitespaces inside the query.
  # Line breaks become spaces (rather than being deleted) so that tokens on
  # adjacent lines can never be fused together.  The squeeze set must be
  # exactly '[:space:]': an extra pair of brackets would make tr squeeze
  # literal '[' and ']' characters too, mangling nested lists.
  json_string=$(printf '%s' "$1" \
    | tr '\r\n' '  ' \
    | tr -s '[:space:]' \
    | python3 -c 'import sys,json; print(json.dumps(sys.stdin.read()))')
  # The $json_string is already quoted
  printf '%s' "$json_string"
}

# Given a GraphQL query/mutation, fire it at the API
# and return the output on stdout.
# Globals:   SECRET_CIRRUS_API_KEY (read) - when non-empty it is sent as a
#            bearer token; when empty/unset no Authorization header is sent.
# Arguments: $1 - GraphQL query/mutation text
#            $2 - optional jq filter-string (defaults to '.')
# Outputs:   formatted JSON reply on stdout (for the caller) and on stderr
#            (for display), plus log-group/error workflow commands on stderr.
# Returns:   0 on success
#            1 if the assembled request body is not valid JSON
#            2 if the reply is null/false, fails to parse, or does not
#              match the filter-string
#            3 if the request failed or the reply was empty
gql() {
  local e_query query filter output filtered formatted
  local -a curl_args
  e_query=$(escape_query "$1")
  query="{\"query\": $e_query}"
  filter="${2:-.}"

  msg "::group::Posting GraphQL Query and checking result"
  msg "query: "
  # Diagnostics go to the already-open stderr (>&2); re-opening /dev/stderr
  # by path would truncate a file that stderr has been redirected to.
  if ! jq -e . <<<"$query" >&2; then
    msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Invalid query JSON: $query"
    msg "::endgroup::"
    return 1
  fi

  curl_args=(
    --request POST
    --silent
    --show-error
    --location
    --header 'content-type: application/json'
    --url 'https://api.cirrus-ci.com/graphql'
    --data "$query"
  )
  # Don't send an empty bearer token when no API key was provided.
  if [[ -n "${SECRET_CIRRUS_API_KEY:-}" ]]; then
    curl_args+=(--header "Authorization: Bearer $SECRET_CIRRUS_API_KEY")
  fi

  if output=$(curl "${curl_args[@]}") && [[ -n "$output" ]]; then

    if filtered=$(jq -e "$filter" <<<"$output") && [[ -n "$filtered" ]]; then
      msg "result:"
      # Make debugging easier w/ formatted output
      # to stderr for display, stdout for consumption by caller
      formatted=$(jq --indent 2 . <<<"$output")
      printf '%s\n' "$formatted" >&2
      printf '%s\n' "$formatted"
      msg "::endgroup::"
      return 0
    fi

    msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query result did not pass filter '$filter': '$output'"
    msg "::endgroup::"
    return 2
  fi

  msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query failed or result empty: '$output'"
  msg "::endgroup::"
  return 3
}
34 changes: 34 additions & 0 deletions .github/actions/check_cirrus_cron/make_email_body.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

set -eo pipefail

# Intended to be executed from a github action workflow step.
# Input: File ($NAME_ID_FILEPATH) listing space separated failed cron build
#        names and IDs, one record per line.
# Output: ./artifacts/email_body.txt, relative to the current working
#         directory (expected to be $GITHUB_WORKSPACE).
# Required env. vars: GITHUB_REPOSITORY, GITHUB_WORKFLOW, NAME_ID_FILEPATH

# Quoted so a checkout path containing spaces doesn't break the source.
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

_errfmt="Expecting %s value to not be empty"
if [[ -z "$GITHUB_REPOSITORY" ]]; then
  err "$(printf "$_errfmt" "\$GITHUB_REPOSITORY")"
elif [[ -z "$GITHUB_WORKFLOW" ]]; then
  err "$(printf "$_errfmt" "\$GITHUB_WORKFLOW")"
elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then
  err "Expecting \$NAME_ID_FILEPATH value ($NAME_ID_FILEPATH) to be a readable file"
fi

mkdir -p artifacts
{
  echo "Detected one or more Cirrus-CI cron-triggered jobs have failed recently:"
  echo ""

  # The '|| [[ -n ... ]]' keeps a final record lacking a trailing newline.
  while read -r NAME BID || [[ -n "$NAME" ]]; do
    # Blank lines carry no record; don't emit a bogus, empty link for them.
    [[ -n "$NAME" ]] || continue
    echo "Cron build '$NAME' Failed: https://cirrus-ci.com/build/$BID"
  done < "$NAME_ID_FILEPATH"

  echo ""
  echo "# Source: ${GITHUB_WORKFLOW} workflow on ${GITHUB_REPOSITORY}."
  # Separate content from sendgrid.com automatic footer.
  echo ""
  echo ""
} > ./artifacts/email_body.txt
112 changes: 112 additions & 0 deletions .github/actions/check_cirrus_cron/rerun_failed_tasks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/bin/bash

set -eo pipefail

# Intended to be executed from a github action workflow step.
# Input: File ($NAME_ID_FILEPATH) listing space separated failed cron build
#        names and IDs, one record per line.
# Output: ./artifacts/rerun_tids.txt, listing the ID of every task selected
#         for re-run (one per line, across all builds).  The selected tasks
#         are re-run through the Cirrus-CI GraphQL API.
#
# HOW TO TEST: This script may be manually tested assuming you have
# access to the github containers-org. Cirrus API key.  With that in-hand,
# this script may be manually run by:
# 1. export SECRET_CIRRUS_API_KEY=<value>
# 2. Find an old podman build that failed on `main` or another **branch**.
#    For example, from https://cirrus-ci.com/github/containers/podman/main
#    (pick an old one from the bottom, since re-running it won't affect anybody)
# 3. Create a temp. file, like /tmp/fail with a single line, of the form:
#    <branch> <cirrus build id number>
# 4. export NAME_ID_FILEPATH=/tmp/fail
# 5. execute this script, and refresh the build in the WebUI, all unsuccessful
#    tasks should change status to running or scheduled.  Note: some later
#    tasks may remain red as they wait for dependencies to run and pass.
# 6. After each run, cleanup with 'rm -rf ./artifacts'
#    (unless you want to examine them)

# Quoted so a checkout path containing spaces doesn't break the source.
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

_errfmt="Expecting %s value to not be empty"
if [[ -z "$SECRET_CIRRUS_API_KEY" ]]; then
  err "$(printf "$_errfmt" "\$SECRET_CIRRUS_API_KEY")"
elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then  # output from cron_failures.sh
  err "$(printf "Expecting %s value to be a readable file" "\$NAME_ID_FILEPATH")"
fi

mkdir -p artifacts
# Record of every task ID selected for re-run; start empty on each execution.
: > ./artifacts/rerun_tids.txt

declare -a rerun_tasks

# The input file is read on FD 3 so that nothing inside the loop body can
# consume it through stdin, and so the loop runs in this shell rather than
# in a pipeline subshell.  The '|| [[ -n ... ]]' keeps a final record that
# lacks a trailing newline.
while read -r NAME BID <&3 || [[ -n "$NAME" ]]; do
  if [[ -z "$NAME" ]]; then
    err "$(printf "$_errfmt" "\$NAME")"
  elif [[ -z "$BID" ]]; then
    err "$(printf "$_errfmt" "\$BID")"
  fi

  id_status_q="
    query {
      build(id: \"$BID\") {
        tasks {
          id,
          status
        }
      }
    }
  "
  task_id_status=$(gql "$id_status_q" '.data.build.tasks[0]')
  # Expected query result like:
  # {
  #   "data": {
  #     "build": {
  #       "tasks": [
  #         {
  #           "id": "6321184690667520",
  #           "status": "COMPLETED"
  #         },
  #         ...
  msg "::group::Selecting failed/aborted tasks to re-run"
  # Extract "<id> <status>" lines up-front, so a jq failure aborts the
  # script (set -e) instead of being hidden behind the loop below.
  task_lines=$(jq -r -e '.data.build.tasks[] | join(" ")' <<<"$task_id_status")

  # Must be reset for every build, otherwise task IDs selected for an
  # earlier build would be sent again with this build's re-run request.
  rerun_tasks=()
  while read -r TID STATUS; do
    if [[ -z "$TID" ]] || [[ -z "$STATUS" ]]; then
      # assume empty line and/or end of file
      msg "Skipping TID '$TID' with status '$STATUS'"
      continue
    # Failed task dependencies will have 'aborted' status
    elif [[ "$STATUS" == "FAILED" ]] || [[ "$STATUS" == "ABORTED" ]]; then
      msg "Rerunning build $BID task $TID"
      rerun_tasks+=("$TID")
      echo "$TID" >> ./artifacts/rerun_tids.txt
    fi
  done <<<"$task_lines"
  msg "::endgroup::"

  if [[ "${#rerun_tasks[@]}" -eq 0 ]]; then
    msg "No tasks to re-run for build $BID"
    continue
  fi

  msg "::warning::Rerunning ${#rerun_tasks[@]} tasks for build $BID"
  # Check-value returned if the gql call was successful
  canary=$(uuidgen)
  # Build a JSON list of quoted IDs; the trailing ',' is stripped from
  # the end (would be invalid JSON)
  task_ids=$(printf '"%s",' "${rerun_tasks[@]}")
  task_ids="[${task_ids%,}]"
  rerun_m="
    mutation {
      batchReRun(input: {
          clientMutationId: \"$canary\",
          taskIds: $task_ids
        }
      ) {
        clientMutationId
      }
    }
  "
  filter='.data.batchReRun.clientMutationId'
  # Report which build/tasks were affected whether the call itself failed
  # or the reply did not echo back the canary value.
  if ! result=$(gql "$rerun_m" "$filter") \
    || [[ $(jq -r -e "$filter" <<<"$result") != "$canary" ]]; then
    err "Attempt to re-run tasks for build $BID failed: ${rerun_tasks[*]}"
  fi
done 3< "$NAME_ID_FILEPATH"
21 changes: 2 additions & 19 deletions .github/workflows/check_cirrus_cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,28 +38,11 @@ jobs:

- name: Get failed cron names and Build IDs
id: cron
run: './.github/actions/${{ github.workflow }}/${{ github.job }}.sh'
run: './.github/actions/check_cirrus_cron/cron_failures.sh'

- if: steps.cron.outputs.failures > 0
shell: bash
# Must be inline, since context expressions are used.
# Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/context-and-expression-syntax-for-github-actions
run: |
set -eo pipefail
(
echo "Detected one or more Cirrus-CI cron-triggered jobs have failed recently:"
echo ""
while read -r NAME BID; do
echo "Cron build '$NAME' Failed: https://cirrus-ci.com/build/$BID"
done < "$NAME_ID_FILEPATH"
echo ""
echo "# Source: ${{ github.workflow }} workflow on ${{ github.repository }}."
# Separate content from sendgrid.com automatic footer.
echo ""
echo ""
) > ./artifacts/email_body.txt
run: './.github/actions/check_cirrus_cron/make_email_body.sh'

- if: steps.cron.outputs.failures > 0
name: Send failure notification e-mail
Expand Down
Loading

0 comments on commit eb7da14

Please sign in to comment.