Skip to content

Commit

Permalink
Merge pull request #16414 from cevich/fix_cirrus_cron_jobs
Browse files Browse the repository at this point in the history
[CI:BUILD] Fix cirrus cirrus-cron GHA workflow scripts, add checks, and tests
  • Loading branch information
openshift-merge-robot authored Nov 10, 2022
2 parents 388b950 + 0334d8d commit 4a4d35d
Show file tree
Hide file tree
Showing 8 changed files with 158 additions and 24 deletions.
16 changes: 8 additions & 8 deletions .github/actions/check_cirrus_cron/cron_failures.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ elif [[ -z "$NAME_ID_FILEPATH" ]]; then # output filepath
err $(printf "$_errfmt" "\$NAME_ID_FILEPATH")
fi

mkdir -p artifacts
confirm_gha_environment

mkdir -p ./artifacts
cat > ./artifacts/query_raw.json << "EOF"
query {
ownerRepository(platform: "LINUX", owner: "@@OWNER@@", name: "@@REPO@@") {
Expand All @@ -32,22 +34,19 @@ EOF
# https://cirrus-ci.com/explorer
owner=$(cut -d '/' -f 1 <<<"$GITHUB_REPOSITORY")
repo=$(cut -d '/' -f 2 <<<"$GITHUB_REPOSITORY")
sed -i -r -e "s/@@OWNER@@/$owner/g" -e "s/@@REPO@@/$repo/g" ./artifacts/query_raw.json

# Easier to debug in error-reply when query is compacted
tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' | tee ./artifacts/query.json | \
jq --indent 4 --color-output .
sed -r -e "s/@@OWNER@@/$owner/g" -e "s/@@REPO@@/$repo/g" \
./artifacts/query_raw.json > ./artifacts/query.json

if grep -q '@@' ./artifacts/query.json; then
err "Found unreplaced substitution token in raw query JSON"
err "Found unreplaced substitution token in query JSON"
fi

# The query should never ever return an empty-list, unless there are no cirrus-cron
# jobs defined for the repository. In that case, this monitoring script shouldn't
# be running anyway.
filt_head='.data.ownerRepository.cronSettings'

gql $(./artifacts/query.json) "$filt_head" > ./artifacts/reply.json
gql "$(<./artifacts/query.json)" "$filt_head" > ./artifacts/reply.json
# e.g. reply.json
# {
# "data": {
Expand Down Expand Up @@ -87,5 +86,6 @@ records=$(wc --words "$NAME_ID_FILEPATH" | cut -d ' ' -f 1)
# $records is a word count of $NAME_ID_FILEPATH; halving it presumably
# reflects two words (a name and an ID) being recorded per failed build.
failures=$((records/2))
# Set the output of this step.
# Ref: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
# Entries appended to the $GITHUB_OUTPUT file must use the 'name=value'
# form; the '::' separator only belonged to the retired
# '::set-output name=...::value' stdout command and is rejected here.
# shellcheck disable=SC2154
echo "failures=$failures" >> "$GITHUB_OUTPUT"
echo "Total failed Cirrus-CI cron builds: $failures"
22 changes: 21 additions & 1 deletion .github/actions/check_cirrus_cron/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,28 @@ msg() {
# Must be called from top-level of script, not another function.
err() {
# Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions
msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::$@"
msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::$*"
exit 1
}

# Sanity-check that the calling script is running under a github-actions
# workflow, then change into the workspace directory.
#
# Globals (read): GITHUB_OUTPUT, GITHUB_WORKFLOW, GITHUB_WORKSPACE
# Arguments: none
# Returns: 0 with the current directory set to $GITHUB_WORKSPACE.
#          On any failed check, err() is called with a description of
#          the problem (err() prints an ::error message and exits 1).
confirm_gha_environment() {
    local _err_fmt
    _err_fmt="I don't seem to be running from a github-actions workflow"
    # These are all defined by github-actions
    # shellcheck disable=SC2154
    if [[ -z "$GITHUB_OUTPUT" ]]; then
        err "$_err_fmt, \$GITHUB_OUTPUT is empty"
    elif [[ -z "$GITHUB_WORKFLOW" ]]; then
        err "$_err_fmt, \$GITHUB_WORKFLOW is empty"
    elif [[ ! -d "$GITHUB_WORKSPACE" ]]; then
        # Defined by github-actions
        # shellcheck disable=SC2154
        err "$_err_fmt, \$GITHUB_WORKSPACE='$GITHUB_WORKSPACE' isn't a directory"
    fi

    # Callers create and fill a relative ./artifacts directory right after
    # this check, so a failed directory change must be reported loudly
    # instead of silently returning non-zero.
    cd "$GITHUB_WORKSPACE" || \
        err "Unable to change into \$GITHUB_WORKSPACE='$GITHUB_WORKSPACE'"
}

# Using python3 here is a compromise for readability and
# properly handling quote, control and unicode character encoding.
escape_query() {
Expand Down Expand Up @@ -45,6 +63,8 @@ gql() {
msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Invalid query JSON: $query"
return 1
fi
# SECRET_CIRRUS_API_KEY is defined as a github secret
# shellcheck disable=SC2154
if output=$(curl \
--request POST \
--silent \
Expand Down
8 changes: 6 additions & 2 deletions .github/actions/check_cirrus_cron/make_email_body.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@ set -eo pipefail
source $(dirname "${BASH_SOURCE[0]}")/lib.sh

_errfmt="Expecting %s value to not be empty"
# NAME_ID_FILEPATH is defined by workflow YAML
# shellcheck disable=SC2154
if [[ -z "$GITHUB_REPOSITORY" ]]; then
err $(printf "$_errfmt" "\$GITHUB_REPOSITORY")
elif [[ -z "$GITHUB_WORKFLOW" ]]; then
err $(printf "$_errfmt" "\$GITHUB_WORKFLOW")
elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then
err "Expecting \$NAME_ID_FILEPATH value ($NAME_ID_FILEPATH) to be a readable file"
fi

confirm_gha_environment

mkdir -p artifacts
(
echo "Detected one or more Cirrus-CI cron-triggered jobs have failed recently:"
Expand All @@ -27,6 +29,8 @@ mkdir -p artifacts
done < "$NAME_ID_FILEPATH"

echo ""
# Defined by github-actions
# shellcheck disable=SC2154
echo "# Source: ${GITHUB_WORKFLOW} workflow on ${GITHUB_REPOSITORY}."
# Separate content from sendgrid.com automatic footer.
echo ""
Expand Down
18 changes: 14 additions & 4 deletions .github/actions/check_cirrus_cron/rerun_failed_tasks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,16 @@ set -eo pipefail
source $(dirname "${BASH_SOURCE[0]}")/lib.sh

_errfmt="Expecting %s value to not be empty"
# NAME_ID_FILEPATH is defined by workflow YAML
# shellcheck disable=SC2154
if [[ -z "$SECRET_CIRRUS_API_KEY" ]]; then
err $(printf "$_errfmt" "\$SECRET_CIRRUS_API_KEY")
elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then # output from cron_failures.sh
err $(printf "Expecting %s value to be a readable file" "\$NAME_ID_FILEPATH")
fi

confirm_gha_environment

mkdir -p artifacts
# If there are no tasks, don't fail reading the file
truncate -s 0 ./artifacts/rerun_tids.txt
Expand Down Expand Up @@ -92,7 +96,9 @@ cat "$NAME_ID_FILEPATH" | \
# Check-value returned if the gql call was successful
canary=$(uuidgen)
# Ensure the trailing ',' is stripped from the end (would be invalid JSON)
task_ids=$(printf '[%s]' $(printf '"%s",' ${rerun_tasks[@]} | head -c -1))
# Rely on shell word-splitting in this case.
# shellcheck disable=SC2048
task_ids=$(printf '[%s]' $(printf '"%s",' ${rerun_tasks[*]} | head -c -1))
rerun_m="
mutation {
batchReRun(input: {
Expand All @@ -105,8 +111,12 @@ cat "$NAME_ID_FILEPATH" | \
}
"
filter='.data.batchReRun.clientMutationId'
result=$(gql "$rerun_m" "$filter")
if [[ $(jq -r -e "$filter"<<<"$result") != "$canary" ]]; then
err "Attempt to re-run tasks for build $BID failed: ${rerun_tasks[@]}"
if [[ ! "$NAME" =~ "testing" ]]; then # see test.sh
result=$(gql "$rerun_m" "$filter")
if [[ $(jq -r -e "$filter"<<<"$result") != "$canary" ]]; then
err "Attempt to re-run tasks for build $BID failed: ${rerun_tasks[*]}"
fi
else
warn "Test-mode: Would have sent GraphQL request: '$rerun_m'"
fi
done
89 changes: 89 additions & 0 deletions .github/actions/check_cirrus_cron/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@


# This script attempts to confirm functional github action scripts.
# It expects to be called from Cirrus-CI, in a special execution
# environment. Any use outside this environment will probably fail.

set -eo pipefail

# Refuse to run unless PREBUILD evaluates to a non-zero number.
# contrib/cirrus/prebuild.sh exports PREBUILD=1 just before it
# executes this script.
# shellcheck disable=SC2154
((PREBUILD)) || {
    echo "Not operating under expected environment"
    exit 1
}

# Assert that a file's contents match an extended regular expression.
# Arguments:
#   $1 - extended (grep -E) regular expression
#   $2 - path of the file to search
# Returns: 0 on a match. Otherwise die (provided by the calling
#          environment, not defined here) is invoked with the regex and
#          the file's contents.
expect_regex() {
    local expected_regex
    local input_file
    expected_regex="$1"
    input_file="$2"
    # 'grep -E' replaces the obsolescent 'egrep'. The file path is quoted
    # so names containing whitespace aren't split into several arguments,
    # and '--' protects against a regex beginning with '-'.
    grep -E -q -- "$expected_regex" "$input_file" || \
        die "No match to '$expected_regex' in '$(<"$input_file")'"
}

req_env_vars CIRRUS_CI CIRRUS_REPO_FULL_NAME CIRRUS_WORKING_DIR CIRRUS_BUILD_ID

# Defined by the CI system
# shellcheck disable=SC2154
cd "$CIRRUS_WORKING_DIR" || die "Unable to cd into '$CIRRUS_WORKING_DIR'"

header="Testing cirrus-cron github-action script:"
msg "$header cron_failures.sh"

base="$CIRRUS_WORKING_DIR/.github/actions/check_cirrus_cron"
# Stand-ins for the variables github-actions would normally provide,
# all pointing at throw-away temporary files/directories.
# Don't care about mktemp return value
# shellcheck disable=SC2155
export GITHUB_OUTPUT=$(mktemp -p '' cron_failures_output_XXXX)
# CIRRUS_REPO_FULL_NAME checked above in req_env_vars
# shellcheck disable=SC2154
export GITHUB_REPOSITORY="$CIRRUS_REPO_FULL_NAME"
# shellcheck disable=SC2155
export GITHUB_WORKSPACE=$(mktemp -d -p '' cron_failures_workspace_XXXX)
export GITHUB_WORKFLOW="testing"
# shellcheck disable=SC2155
export NAME_ID_FILEPATH=$(mktemp -p '' cron_failures_data_XXXX)
# Single-quoted so every path is expanded (and quoted) when the trap fires.
trap 'rm -rf -- "$GITHUB_OUTPUT" "$GITHUB_WORKSPACE" "$NAME_ID_FILEPATH"' EXIT

#####

# Start from an unrelated directory, presumably to prove the scripts
# find $GITHUB_WORKSPACE on their own.
cd /tmp || die "Unable to cd into /tmp"
# Replace newlines and indentation to make regex matching easier
if ! "$base/cron_failures.sh" |& \
        tr -s '[:space:]' ' ' > "$GITHUB_WORKSPACE/output"; then
    die "Failed: $base/cron_failures.sh with output '$(<"$GITHUB_WORKSPACE/output")'"
fi

expect_regex \
    'result.+data.+ownerRepository.+cronSettings.+endgroup' \
    "$GITHUB_WORKSPACE/output"

#####

msg "$header make_email_body.sh"
# It's possible no cirrus-cron jobs actually failed
echo '' >> "$NAME_ID_FILEPATH"
# Don't need to test stdout/stderr of this
if ! "$base/make_email_body.sh"; then
    die "make_email_body.sh failed"
fi

expect_regex \
    '^Detected.+Cirrus-CI.+failed.*' \
    "$GITHUB_WORKSPACE/artifacts/email_body.txt"

#####

msg "$header rerun_failed_tasks.sh"
export SECRET_CIRRUS_API_KEY=testing-nottherightkey
# rerun_failed_tasks.sh is sensitive to the 'testing' name: it only warns
# instead of sending the re-run request. Var. defined by cirrus-ci
# shellcheck disable=SC2154
echo "testing $CIRRUS_BUILD_ID" > "$NAME_ID_FILEPATH"
if ! "$base/rerun_failed_tasks.sh" |& \
        tr -s '[:space:]' ' ' > "$GITHUB_WORKSPACE/rerun_output"; then
    die "rerun_failed_tasks.sh failed"
fi

expect_regex \
    "Posting GraphQL Query.+$CIRRUS_BUILD_ID.+Selecting.+re-run" \
    "$GITHUB_WORKSPACE/rerun_output"
8 changes: 4 additions & 4 deletions .github/workflows/check_cirrus_cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
cron_failures:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
- uses: actions/checkout@v3
with:
persist-credentials: false

Expand All @@ -47,7 +47,7 @@ jobs:
- if: steps.cron.outputs.failures > 0
name: Send failure notification e-mail
# Ref: https://github.com/dawidd6/action-send-mail
uses: dawidd6/action-send-mail@a80d851dc950256421f1d1d735a2dc1ef314ac8f # v2.2.2
uses: dawidd6/action-send-mail@v3.7.1
with:
server_address: ${{secrets.ACTION_MAIL_SERVER}}
server_port: 465
Expand All @@ -59,14 +59,14 @@ jobs:
body: file://./artifacts/email_body.txt

- if: always()
uses: actions/upload-artifact@83fd05a356d7e2593de66fc9913b3002723633cb # v3.1.1
uses: actions/upload-artifact@v3
with:
name: ${{ github.job }}_artifacts
path: artifacts/*

- if: failure()
name: Send error notification e-mail
uses: dawidd6/action-send-mail@a80d851dc950256421f1d1d735a2dc1ef314ac8f # v2.2.2
# The ref must name an existing tag; this action's release tags are
# 'v'-prefixed (v3.7.1), so a bare '3.7.1' would not resolve.
uses: dawidd6/action-send-mail@v3.7.1
with:
server_address: ${{secrets.ACTION_MAIL_SERVER}}
server_port: 465
Expand Down
10 changes: 6 additions & 4 deletions .github/workflows/rerun_cirrus_cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ on:
# Debug: Allow triggering job manually in github-actions WebUI
workflow_dispatch: {}


env:
# Debug-mode can reveal secrets, only enable by a secret value.
# Ref: https://help.github.com/en/actions/configuring-and-managing-workflows/managing-a-workflow-run#enabling-step-debug-logging
Expand All @@ -28,10 +29,10 @@ permissions:
contents: read

jobs:
cron_failures:
cron_rerun:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@629c2de402a417ea7690ca6ce3f33229e27606a5 # v2
- uses: actions/checkout@v3
with:
persist-credentials: false

Expand All @@ -42,14 +43,15 @@ jobs:
- if: steps.cron.outputs.failures > 0
shell: bash
run: './.github/actions/check_cirrus_cron/rerun_failed_tasks.sh'
uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2

- uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2
with:
name: ${{ github.job }}_artifacts
path: artifacts/*

- if: failure()
name: Send error notification e-mail
uses: dawidd6/action-send-mail@a80d851dc950256421f1d1d735a2dc1ef314ac8f # v2.2.2
uses: dawidd6/action-send-mail@v3.7.1
with:
server_address: ${{secrets.ACTION_MAIL_SERVER}}
server_port: 465
Expand Down
11 changes: 10 additions & 1 deletion contrib/cirrus/prebuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ set -eo pipefail
# prevent wasting time on tests that can't succeed due to some
# outage, failure, or missed expectation.

set -a
source /etc/automation_environment
source $AUTOMATION_LIB_PATH/common_lib.sh
set +a

req_env_vars CI DEST_BRANCH IMAGE_SUFFIX TEST_FLAVOR TEST_ENVIRON \
PODBIN_NAME PRIV_NAME DISTRO_NV AUTOMATION_LIB_PATH \
Expand All @@ -21,20 +23,27 @@ req_env_vars CI DEST_BRANCH IMAGE_SUFFIX TEST_FLAVOR TEST_ENVIRON \
# shellcheck disable=SC2154
cd $CIRRUS_WORKING_DIR

msg "Checking Cirrus YAML"
# Defined by CI config.
# shellcheck disable=SC2154
showrun $SCRIPT_BASE/cirrus_yaml_test.py

# Defined by CI config.
# shellcheck disable=SC2154
if [[ "${DISTRO_NV}" =~ fedora ]]; then
msg "Checking shell scripts"
showrun ooe.sh dnf install -y ShellCheck # small/quick addition
showrun shellcheck --color=always --format=tty \
--shell=bash --external-sources \
--enable add-default-case,avoid-nullary-conditions,check-unassigned-uppercase \
--exclude SC2046,SC2034,SC2090,SC2064 \
--wiki-link-count=0 --severity=warning \
$SCRIPT_BASE/*.sh hack/get_ci_vm.sh
$SCRIPT_BASE/*.sh \
./.github/actions/check_cirrus_cron/* \
hack/get_ci_vm.sh

export PREBUILD=1
showrun bash ${CIRRUS_WORKING_DIR}/.github/actions/check_cirrus_cron/test.sh
fi

msg "Checking 3rd party network service connectivity"
Expand Down

0 comments on commit 4a4d35d

Please sign in to comment.