From d425605fe7607d0e9b08a3b268c63ed38c0ea382 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 10 Dec 2024 13:36:24 -0800 Subject: [PATCH 1/4] Convert alerting into a reusable workflow. --- .github/workflows/ci.yml | 35 ++---------- .github/workflows/workflow_summary.yml | 62 ++++++++++++++++++++++ build_tools/github_actions/configure_ci.py | 12 +++-- 3 files changed, 76 insertions(+), 33 deletions(-) create mode 100644 .github/workflows/workflow_summary.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 985ed44b9176..226e09d08ba8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -198,39 +198,14 @@ jobs: ############################################################################## - # Depends on all the other jobs to provide a single anchor that indicates the - # final status. Status reporting will become more sophisticated in the future - # and we can hopefully avoid the need to explicitly list every single job... - summary: - # Even if you have an explicit if condition, you still need to override - # GitHub's default behavior of not running if any dependencies failed. + # Aggregate job status and alerting on failures. + ci_summary: if: always() - runs-on: ubuntu-20.04 needs: - setup - runtime - runtime_small - runtime_tracing - steps: - - name: "Checking out repository" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Getting failed jobs - id: failed_jobs - run: | - echo '${{ toJson(needs) }}' - FAILED_JOBS="$(echo '${{ toJson(needs) }}' \ - | jq --raw-output \ - 'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \ - )" - echo "failed-jobs=${FAILED_JOBS}" >> $GITHUB_OUTPUT - if [[ "${FAILED_JOBS}" != "" ]]; then - echo "The following jobs failed: ${FAILED_JOBS}" - exit 1 - fi - - name: Posting to Discord - uses: sarisia/actions-status-discord@ce8cc68e4e626000136b3c702d049a154243e490 # v1.14.7 - if: failure() && github.ref_name == 'main' - with: - webhook: ${{ secrets.DISCORD_WEBHOOK }} - description: "The following jobs failed: ${{ steps.failed_jobs.outputs.failed-jobs }}" - url: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }}" + uses: ./.github/workflows/workflow_summary.yml + with: + jobs-json: ${{ toJson(needs) }} diff --git a/.github/workflows/workflow_summary.yml b/.github/workflows/workflow_summary.yml new file mode 100644 index 000000000000..de19634ff2cd --- /dev/null +++ b/.github/workflows/workflow_summary.yml @@ -0,0 +1,62 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Checks the result status of each job provided by 'jobs-json' and sends an +# alert if at least one job failed. +# +# Usage: +# ```yml +# jobs: +# job_1: +# ... +# job_2: +# ... +# my_summary: +# if: always() +# needs: +# - job_1 +# - job_2 +# uses: ./.github/workflows/workflow_summary.yml +# with: +# jobs-json: ${{ toJson(needs) }} +# ``` + +name: Workflow Summary + +on: + workflow_call: + inputs: + jobs-json: + type: string + description: The output of `toJson(needs)` + +permissions: + contents: read + +jobs: + summary: + runs-on: ubuntu-20.04 + steps: + - name: Getting failed jobs + id: failed_jobs + run: | + echo '${{ inputs.jobs-json }}' + FAILED_JOBS="$(echo '${{ inputs.jobs-json }}' \ + | jq --raw-output \ + 'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \ + )" + echo "failed-jobs=${FAILED_JOBS}" >> $GITHUB_OUTPUT + if [[ "${FAILED_JOBS}" != "" ]]; then + echo "The following jobs failed: ${FAILED_JOBS}" + exit 1 + fi + - name: Posting to Discord + uses: sarisia/actions-status-discord@ce8cc68e4e626000136b3c702d049a154243e490 # v1.14.7 + if: failure() && github.ref_name == 'main' && github.repository_owner == 'iree-org' + with: + webhook: ${{ secrets.DISCORD_WEBHOOK }} + description: "The following jobs failed: ${{ steps.failed_jobs.outputs.failed-jobs }}" + url: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }}" diff --git a/build_tools/github_actions/configure_ci.py b/build_tools/github_actions/configure_ci.py index 55247e9c7b3c..0f991890ce81 100755 --- a/build_tools/github_actions/configure_ci.py +++ b/build_tools/github_actions/configure_ci.py @@ -39,8 +39,8 @@ import fnmatch import json import os -import re import pathlib +import re import string import subprocess import sys @@ -111,7 +111,12 @@ def contains(cls, val): RUNNER_ENV_DEFAULT = "prod" RUNNER_ENV_OPTIONS = [RUNNER_ENV_DEFAULT, "testing"] -CONTROL_JOBS = frozenset(["setup", "summary"]) +CONTROL_JOB_REGEXES = frozenset( + [ + re.compile("setup"), + re.compile(".*summary"), + ] +) # Jobs to run only on postsubmit by default. # They may also run on presubmit only under certain conditions. @@ -380,7 +385,8 @@ def parse_jobs_from_workflow_file(workflow_file: pathlib.Path) -> Set[str]: workflow = yaml.load(workflow_file.read_text(), Loader=yaml.SafeLoader) all_jobs = set(workflow["jobs"].keys()) - all_jobs -= CONTROL_JOBS + for regex in CONTROL_JOB_REGEXES: + all_jobs = [j for j in all_jobs if not regex.match(j)] if ALL_KEY in all_jobs: raise ValueError(f"Workflow has job with reserved name '{ALL_KEY}'") From aef5a069db43234f6de254d6f38181a5372f6fe3 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 10 Dec 2024 13:49:48 -0800 Subject: [PATCH 2/4] Use workflow_summary in pkgci.yml. --- .github/workflows/pkgci.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.github/workflows/pkgci.yml b/.github/workflows/pkgci.yml index 78e03a9644cf..cc2691c5b1b1 100644 --- a/.github/workflows/pkgci.yml +++ b/.github/workflows/pkgci.yml @@ -121,3 +121,27 @@ jobs: needs: [setup, build_packages] if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_pjrt') uses: ./.github/workflows/pkgci_test_pjrt.yml + + ############################################################################## + + # Aggregate job status and alerting on failures. + pkgci_summary: + if: always() + needs: + - setup + - build_packages + - unit_test + - regression_test + - test_amd_mi250 + - test_amd_mi300 + - test_amd_w7900 + # - test_nvidia_t4 + - test_android + - test_riscv64 + - test_onnx + - test_sharktank + - test_tensorflow + - test_pjrt + uses: ./.github/workflows/workflow_summary.yml + with: + jobs-json: ${{ toJson(needs) }} From 2039049dbdfdb930ec730462736d113dfbb1c9d2 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 10 Dec 2024 14:02:37 -0800 Subject: [PATCH 3/4] HACK: test alerting --- .github/workflows/workflow_summary.yml | 4 +++- runtime/src/iree/base/api.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/workflow_summary.yml b/.github/workflows/workflow_summary.yml index de19634ff2cd..66789ff82675 100644 --- a/.github/workflows/workflow_summary.yml +++ b/.github/workflows/workflow_summary.yml @@ -55,7 +55,9 @@ jobs: fi - name: Posting to Discord uses: sarisia/actions-status-discord@ce8cc68e4e626000136b3c702d049a154243e490 # v1.14.7 - if: failure() && github.ref_name == 'main' && github.repository_owner == 'iree-org' + # if: failure() && github.ref_name == 'main' && github.repository_owner == 'iree-org' + # DO NOT SUBMIT + if: failure() && github.repository_owner == 'iree-org' with: webhook: ${{ secrets.DISCORD_WEBHOOK }} description: "The following jobs failed: ${{ steps.failed_jobs.outputs.failed-jobs }}" diff --git a/runtime/src/iree/base/api.h b/runtime/src/iree/base/api.h index 733c74e6515c..9945e03eb52c 100644 --- a/runtime/src/iree/base/api.h +++ b/runtime/src/iree/base/api.h @@ -93,6 +93,8 @@ #include "iree/base/tracing.h" // IWYU pragma: export #include "iree/base/wait_source.h" // IWYU pragma: export +#error "DO NOT SUBMIT - testing job failure alerting" + #ifdef __cplusplus extern "C" { #endif // __cplusplus From 1c40e46505129338b802224b05c64d7c2394666d Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 10 Dec 2024 14:12:55 -0800 Subject: [PATCH 4/4] Um. Use a set comprehension instead of a list comprehension? --- build_tools/github_actions/configure_ci.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/github_actions/configure_ci.py b/build_tools/github_actions/configure_ci.py index 0f991890ce81..8144aa882ac3 100755 --- a/build_tools/github_actions/configure_ci.py +++ b/build_tools/github_actions/configure_ci.py @@ -386,7 +386,7 @@ def parse_jobs_from_workflow_file(workflow_file: pathlib.Path) -> Set[str]: workflow = yaml.load(workflow_file.read_text(), Loader=yaml.SafeLoader) all_jobs = set(workflow["jobs"].keys()) for regex in CONTROL_JOB_REGEXES: - all_jobs = [j for j in all_jobs if not regex.match(j)] + all_jobs = {j for j in all_jobs if not regex.match(j)} if ALL_KEY in all_jobs: raise ValueError(f"Workflow has job with reserved name '{ALL_KEY}'")