.github/workflows/deploy-hubs.yaml

# The use of ::set-output in this workflow file was deprecated as per:
# https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/
# We have instead pivoted to writing to the GITHUB_ENV file, either by the
# method suggested in the above blog post, or by following the below Stack
# Overflow answer for python code: https://stackoverflow.com/a/70123641
# More info on GITHUB_ENV: https://docs.github.com/en/actions/learn-github-actions/environment-variables

name: Deploy and test hubs

on:
  push:
    branches:
      - main
    paths:
      # Include changes to the deployer script's folder, but exclude some parts
      - deployer/**
      - "!deployer/README.md"
      - "!deployer/health_check_tests/**"
      - "!deployer/commands/generate/billing/**"
      - "!deployer/commands/generate/dedicated_cluster/**"
      - "!deployer/commands/generate/hub_asset/**"
      - "!deployer/commands/generate/resource_allocation/**"
      - requirements.txt
      - .github/actions/setup-deploy/**
      - helm-charts/**
      - config/clusters/**
      # Exclude the template configuration files
      - "!config/clusters/templates"
  pull_request:
    branches:
      - main
    paths:
      # Include changes to the deployer script's folder, but exclude some parts
      - deployer/**
      - "!deployer/README.md"
      - "!deployer/health_check_tests/**"
      - "!deployer/commands/generate/billing/**"
      - "!deployer/commands/generate/dedicated_cluster/**"
      - "!deployer/commands/generate/hub_asset/**"
      - "!deployer/commands/generate/resource_allocation/**"
      - requirements.txt
      - .github/actions/setup-deploy/**
      - helm-charts/**
      - config/clusters/**
      # Exclude the template configuration files
      - "!config/clusters/templates"

# When multiple PRs triggering this workflow are merged, queue them instead
# of running them in parallel
# https://github.blog/changelog/2021-04-19-github-actions-limit-workflow-run-or-job-concurrency/
concurrency: deploy

# This environment variable triggers the deployer to colourise print statements in the
# GitHug Actions logs for easy reading
env:
  TERM: xterm
  # This may not be required any more, and it may depend on the kubectl version
  # we use etc as well. For now, we have it added to avoid issues.
  USE_GKE_GCLOUD_AUTH_PLUGIN: "True"

jobs:
  # This job runs in Pull Requests and on pushes to the default branch. It identifies
  # which files have been added or modified by recent GitHub activity and parsed a list
  # to the `deployer generate helm-upgrade-job`s command of the deployer. This command generates
  # two lists of dictionaries, which can be read by GitHub Actions as matrix jobs. The
  # first set of jobs describes which clusters need their support chart and/or staging
  # hub upgraded; and the second set of jobs describe which production hubs require
  # upgrading. These two lists are set as job outputs via GITHUB_ENV to be consumed
  # by the later jobs. They are also pretty-printed in a human-readable format to the
  # logs, and converted into Markdown tables for posting into GitHub comments.
  generate-jobs:
    runs-on: ubuntu-latest
    outputs:
      support-and-staging-matrix-jobs: ${{ env.support-and-staging-matrix-jobs }}
      prod-hub-matrix-jobs: ${{ env.prod-hub-matrix-jobs }}

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      # There will almost never be a cache hit on the cache key when this job is
      # run, as it is the first of all jobs in this workflow. An exception is if
      # this job is re-attempted as part of the same workflow run after
      # succeeding previously.
      - name: Save pip's install cache on job completion
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          # key determines if we define or reuse an existing cache or not. Our
          # key ensure we cache within a workflow run and its attempts, but not
          # between workflow runs.
          key: "${{ github.run_id }}"

      - name: Install deployer script's Python dependencies
        run: |
          pip install --editable .
          pip list

      - name: Identify files that have been added or modified
        # Action repo: https://github.com/dorny/paths-filter
        uses: dorny/paths-filter@v3
        id: changed-files
        with:
          token: ""
          list-files: csv
          filters: |
            changed:
              - added|modified: deployer/**
              - added|modified: requirements.txt
              - added|modified: .github/actions/setup-deploy/**
              - added|modified: helm-charts/basehub/**
              - added|modified: helm-charts/daskhub/**
              - added|modified: helm-charts/support/**
              - added|modified: config/clusters/**

      # This step will create a comment-body.txt file containing the jobs to be run in a
      # Markdown table format to be posted on a Pull Request, if this job is triggered
      # by one
      - name: Generate matrix jobs
        run: |
          deployer generate helm-upgrade-jobs "${{ steps.changed-files.outputs.changed_files }}"

      # The comment-deployment-plan-pr.yaml workflow won't have the correct context to
      # know the PR number, so we save it to a file to pass to that workflow
      - name: Save Pull Request number to a file
        if: github.event_name == 'pull_request'
        run: |
          echo "${{ github.event.number }}" > pr-number.txt

      # Upload the pr-number.txt and comment-body.txt files as artifacts for the
      # comment-deployment-plan-pr.yaml workflow to access
      - name: Upload artifacts
        # Only run this steps in PRs when the matrices are not empty
        if: >
          github.event_name == 'pull_request' &&
          (env.support-and-staging-matrix-jobs != '[]' ||
          env.prod-hub-matrix-jobs != '[]')
        uses: actions/upload-artifact@v4
        with:
          name: pr
          path: |
            pr-number.txt
            comment-body.txt

      # https://github.com/ravsamhq/notify-slack-action
      # Needs to be added per job
      # When https://github.com/integrations/slack/issues/1563 gets implemented,
      # we can use that instead
      - name: Report Status
        if: always()
        uses: ravsamhq/notify-slack-action@v2
        with:
          # Warning: there are multiple "Report Status" steps in this file (one per job).
          # Make sure they are all updated
          notify_when: "failure"
          status: ${{ job.status }} # required
          # Message should look like: "Hey @author! Deploy and test hubs failed for "Add new hub - 9305e08".
          message_format: '{emoji} Hey @${{ github.event.head_commit.author.name }}! *{workflow}* {status_message} for "${{ github.event.head_commit.message }} - <{commit_url}|{commit_sha}>". Checkout the run at {run_url}.'
          footer: "<{run_url}|Failing Run>"
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GHA_FAILURES_WEBHOOK_URL }}
  # This job upgrades the support chart, staging hub, and dask-staging hub (if present)
  # for clusters in parallel, if those upgrades are required. This job needs the
  # `generate-jobs` job to have completed and set an output to the
  # `support-and-staging-matrix-jobs` variable name. It's inputs are a list of
  # dictionaries with the keys cluster_name, provider, upgrade_support, and
  # upgrade_staging for each cluster that requires it.
  upgrade-support-and-staging:
    runs-on: ubuntu-latest
    needs: [generate-jobs]

    # We declare outputs indicating the job failed status of a specific job
    # variation. We are currently required to do this in a hardcoded fashion,
    # see this post for feature requests for this to be improved:
    # https://github.community/t/bug-jobs-output-should-return-a-list-for-a-matrix-job/128626/32?u=consideratio
    #
    # Warning: names can include alphanumerics, '-', and '_', but not '.', so
    #            we replace '.' for '-' in cluster names.
    #
    # If you are adding a new cluster, please remember to list it here!
    outputs:
      failure_2i2c-aws-us: "${{ env.failure_2i2c-aws-us }}"
      failure_2i2c-uk: "${{ env.failure_2i2c-uk }}"
      failure_2i2c: "${{ env.failure_2i2c }}"
      failure_awi-ciroh: "${{ env.failure_awi-ciroh }}"
      failure_bican: "${{ env.failure_bican }}"
      failure_catalystproject-africa: "${{ env.failure_catalystproject-africa }}"
      failure_catalystproject-latam: "${{ env.failure_catalystproject-latam }}"
      failure_cloudbank: "${{ env.failure_cloudbank }}"
      failure_dandi: "${{ env.failure_dandi }}"
      failure_earthscope: "${{ env.failure_earthscope }}"
      failure_gridsst: "${{ env.failure_gridsst }}"
      failure_hhmi: "${{ env.failure_hhmi }}"
      failure_jupyter-health: "${{ env.failure_jupyter-health }}"
      failure_jupyter-meets-the-earth: "${{ env.failure_jupyter-meets-the-earth }}"
      failure_kitware: "${{ env.failure_kitware }}"
      failure_leap: "${{ env.failure_leap }}"
      failure_linc: "${{ env.failure_linc }}"
      failure_linked-earth: "${{ env.failure_linked-earth }}"
      failure_meom-ige: "${{ env.failure_meom-ige }}"
      failure_nasa-cryo: "${{ env.failure_nasa-cryo }}"
      failure_nasa-esdis: "${{ env.failure_nasa-esdis }}"
      failure_nasa-ghg: "${{ env.failure_nasa-ghg }}"
      failure_nasa-veda: "${{ env.failure_nasa-veda }}"
      failure_openscapes: "${{ env.failure_openscapes }}"
      failure_opensci: "${{ env.failure_opensci }}"
      failure_pangeo-hubs: "${{ env.failure_pangeo-hubs }}"
      failure_qcl: "${{ env.failure_qcl }}"
      failure_smithsonian: "${{ env.failure_smithsonian }}"
      failure_ubc-eoas: "${{ env.failure_ubc-eoas }}"
      failure_utoronto: "${{ env.failure_utoronto }}"
      failure_victor: "${{ env.failure_victor }}"

    # Only run this job on pushes to the default branch and when the job output is not
    # an empty list
    if: |
      (github.event_name == 'push' && contains(github.ref, 'main')) &&
      needs.generate-jobs.outputs.support-and-staging-matrix-jobs != '[]'
    strategy:
      # Don't stop other deployments if one fails
      fail-fast: false
      matrix:
        jobs: ${{ fromJson(needs.generate-jobs.outputs.support-and-staging-matrix-jobs) }}

    steps:
      - uses: actions/checkout@v4

      - name: Setup deploy for ${{ matrix.jobs.cluster_name }}
        uses: ./.github/actions/setup-deploy
        with:
          provider: ${{ matrix.jobs.provider }}
          GCP_KMS_DECRYPTOR_KEY: ${{ secrets.GCP_KMS_DECRYPTOR_KEY }}

      - name: Upgrade support chart on cluster ${{ matrix.jobs.cluster_name }}
        if: matrix.jobs.upgrade_support
        run: |
          deployer deploy-support ${{ matrix.jobs.cluster_name }}

      - name: Upgrade staging hub on cluster ${{ matrix.jobs.cluster_name }}
        if: matrix.jobs.upgrade_staging
        run: |
          deployer deploy ${{ matrix.jobs.cluster_name }} staging

      # Retry action: https://github.com/marketplace/actions/retry-step
      - name: Run health check for staging hub on cluster ${{ matrix.jobs.cluster_name }}
        if: matrix.jobs.upgrade_staging
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 10
          max_attempts: 2
          command: |
            deployer run-hub-health-check ${{ matrix.jobs.cluster_name }} staging

      - name: Upgrade dask-staging hub on cluster ${{ matrix.jobs.cluster_name }} if it exists
        if: matrix.jobs.upgrade_staging && matrix.jobs.cluster_name == '2i2c'
        run: |
          deployer deploy ${{ matrix.jobs.cluster_name }} dask-staging

      - name: Upgrade binder-staging hub on cluster ${{ matrix.jobs.cluster_name }} if it exists
        if: matrix.jobs.upgrade_staging && matrix.jobs.cluster_name == '2i2c'
        run: |
          deployer deploy ${{ matrix.jobs.cluster_name }} binder-staging

      # Retry action: https://github.com/marketplace/actions/retry-step
      - name: Run health check for dask-staging hub on cluster ${{ matrix.jobs.cluster_name }} if it exists
        if: matrix.jobs.upgrade_staging && matrix.jobs.cluster_name == '2i2c'
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 10
          max_attempts: 2
          command: |
            deployer run-hub-health-check ${{ matrix.jobs.cluster_name }} dask-staging

      - name: Declare failure status
        if: always()
        shell: python
        run: |
          import os

          name = "${{ matrix.jobs.cluster_name }}".replace(".", "-")
          failure = "${{ job.status == 'failure' }}"

          env_file = os.getenv("GITHUB_ENV")
          with open(env_file, "a") as f:
              f.write(f"failure_{name}={failure}")

      # https://github.com/ravsamhq/notify-slack-action
      # Needs to be added per job
      # FIXME: when https://github.com/integrations/slack/issues/1563 gets implemented,
      # we can use that instead
      - name: Report Status
        if: always()
        uses: ravsamhq/notify-slack-action@v2
        # Warning: there are multiple "Report Status" steps in this file (one per job).
        # Make sure they are all updated
        with:
          notify_when: "failure"
          status: ${{ job.status }} # required
          # Message should look like: "Hey @author! Deploy and test hubs failed for "Add new hub - 9305e08".
          message_format: '{emoji} Hey @${{ github.event.head_commit.author.name }}! *{workflow}* {status_message} for "${{ github.event.head_commit.message }} - <{commit_url}|{commit_sha}>". Checkout the run at {run_url}.'
          footer: "<{run_url}|Failing Run>"
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GHA_FAILURES_WEBHOOK_URL }}

  # This jobs reduces the initially planned prod-hub-matrix-jobs deployments by
  # filtering out any deployment to a cluster with a failed support-and-staging
  # job.
  filter-generate-jobs:
    runs-on: ubuntu-latest
    needs: [generate-jobs, upgrade-support-and-staging]
    # Only run this job on pushes to the default branch and when the job output is not
    # an empty list
    #
    # always() is added as it seems to be a magic function to ensure a job can
    # run at all without being skipped if a previous job has had any failures.
    # https://docs.github.com/en/actions/learn-github-actions/expressions#always
    if: |
      always() &&
      (github.event_name == 'push' && contains(github.ref, 'main')) &&
      needs.generate-jobs.outputs.prod-hub-matrix-jobs != '[]'

    outputs:
      filtered-prod-hub-matrix-jobs: ${{ env.filtered-prod-hub-matrix-jobs }}

    steps:
      # This Python script filters out any prod hub deployment job from running
      # later based on if its part of a cluster where support/staging upgrade
      # just failed. Data is injected to the script before its executed via
      # string literals as rendered GitHub workflow expressions.
      - name: Filter prod deploy jobs to run based on failures in support/staging
        shell: python
        run: |
          import os
          import json

          jobs = json.loads(r"""${{ needs.generate-jobs.outputs.prod-hub-matrix-jobs }}""")
          outputs = json.loads(r"""${{ toJson(needs.upgrade-support-and-staging.outputs) }}""")

          try:
            filtered_jobs = [
                job
                for job in jobs
                if outputs[f"failure_{job['cluster_name'].replace('.', '-')}"] != "true"
            ]
          except KeyError:
            print(f"The {cluster_name} cluster wasn't found in the `upgrade-support-and-staging.outputs` list. Please add it before continuing!")

          env_file = os.getenv("GITHUB_ENV")
          with open(env_file, "a") as f:
              f.write(f"filtered-prod-hub-matrix-jobs={json.dumps(filtered_jobs)}")

      # https://github.com/ravsamhq/notify-slack-action
      # Needs to be added per job
      # When https://github.com/integrations/slack/issues/1563 gets implemented,
      # we can use that instead
      - name: Report Status
        if: always()
        uses: ravsamhq/notify-slack-action@v2
        with:
          # Warning: there are multiple "Report Status" steps in this file (one per job).
          # Make sure they are all updated
          notify_when: "failure"
          status: ${{ job.status }} # required
          # Message should look like: "Hey @author! Deploy and test hubs failed for "Add new hub - 9305e08".
          message_format: '{emoji} Hey @${{ github.event.head_commit.author.name }}! *{workflow}* {status_message} for "${{ github.event.head_commit.message }} - <{commit_url}|{commit_sha}>". Checkout the run at {run_url}.'
          footer: "<{run_url}|Failing Run>"
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GHA_FAILURES_WEBHOOK_URL }}

  # This job upgrades production hubs on clusters in parallel, if required. This
  # job needs both the `filter-generate-jobs` to have completed to provide its
  # output `filtered-prod-hub-matrix-jobs`. It is a list of dictionaries with
  # the keys cluster_name, provider, and hub_name for each production hub that
  # requires an upgrade and didn't have a failed support-and-staging-upgrade job
  # run as part of this workflow.
  upgrade-prod-hubs:
    runs-on: ubuntu-latest
    needs: [filter-generate-jobs]
    # Only run this job on pushes to the default branch and when the `generate-jobs` job output is not
    # an empty list
    #
    # always() is added as it seems to be a magic function to ensure a job can
    # run at all without being skipped if a previous job has had any failures. By
    # using always() we need to ensure filter-generate-jobs wasn't skipped or
    # failed explicitly now though.
    # https://docs.github.com/en/actions/learn-github-actions/expressions#always
    if: |
      always() && needs.filter-generate-jobs.result == 'success' &&
      (github.event_name == 'push' && contains(github.ref, 'main')) &&
      needs.filter-generate-jobs.outputs.filtered-prod-hub-matrix-jobs != '[]'
    strategy:
      # Don't stop other deployments if one fails
      fail-fast: false
      matrix:
        jobs: ${{ fromJson(needs.filter-generate-jobs.outputs.filtered-prod-hub-matrix-jobs) }}

    steps:
      - uses: actions/checkout@v4

      - name: Setup deploy for ${{ matrix.jobs.cluster_name }} cluster
        uses: ./.github/actions/setup-deploy
        with:
          provider: ${{ matrix.jobs.provider }}
          GCP_KMS_DECRYPTOR_KEY: ${{ secrets.GCP_KMS_DECRYPTOR_KEY }}

      - name: Upgrade ${{ matrix.jobs.hub_name }} hub on cluster ${{ matrix.jobs.cluster_name }}
        run: |
          deployer deploy ${{ matrix.jobs.cluster_name }} ${{ matrix.jobs.hub_name }}

      # Retry action: https://github.com/marketplace/actions/retry-step
      - name: Run health check against ${{ matrix.jobs.hub_name }} hub on cluster ${{ matrix.jobs.cluster_name}}
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 10
          max_attempts: 3
          command: |
            deployer run-hub-health-check ${{ matrix.jobs.cluster_name }} ${{ matrix.jobs.hub_name }}

      # https://github.com/ravsamhq/notify-slack-action
      # Needs to be added per job
      # When https://github.com/integrations/slack/issues/1563 gets implemented,
      # we can use that instead
      - name: Report Status
        if: always()
        uses: ravsamhq/notify-slack-action@v2
        with:
          # Warning: there are multiple "Report Status" steps in this file (one per job).
          # Make sure they are all updated
          notify_when: "failure"
          status: ${{ job.status }} # required
          # Message should look like: "Hey @author! Deploy and test hubs failed for "Add new hub - 9305e08".
          message_format: '{emoji} Hey @${{ github.event.head_commit.author.name }}! *{workflow}* {status_message} for "${{ github.event.head_commit.message }} - <{commit_url}|{commit_sha}>". Checkout the run at {run_url}.'
          footer: "<{run_url}|Failing Run>"
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GHA_FAILURES_WEBHOOK_URL }}