Skip to content

[deployer exec root-homes]: wait for the pod to be ready before exec-ing into it #5839

[deployer exec root-homes]: wait for the pod to be ready before exec-ing into it

[deployer exec root-homes]: wait for the pod to be ready before exec-ing into it #5839

Workflow file for this run

name: Deploy and test hubs
on:
push:
branches:
- main
paths:
# Include changes to the deployer script's folder, but exclude some parts
- deployer/**
- "!deployer/README.md"
- "!deployer/health_check_tests/**"
- "!deployer/commands/generate/billing/**"
- "!deployer/commands/generate/dedicated_cluster/**"
- "!deployer/commands/generate/hub_asset/**"
- "!deployer/commands/generate/resource_allocation/**"
- requirements.txt
- .github/actions/setup-deploy/**
- helm-charts/**
- config/clusters/**
# Exclude the template configuration files
- "!config/clusters/templates/**"
- "!terraform/aws/projects/template.tfvars"
- "!terraform/gcp/projects/cluster.tfvars.template"
- "!eksctl/template.jsonnet"
pull_request:
branches:
- main
paths:
# Include changes to the deployer script's folder, but exclude some parts
- deployer/**
- "!deployer/README.md"
- "!deployer/health_check_tests/**"
- "!deployer/commands/generate/billing/**"
- "!deployer/commands/generate/dedicated_cluster/**"
- "!deployer/commands/generate/hub_asset/**"
- "!deployer/commands/generate/resource_allocation/**"
- requirements.txt
- .github/actions/setup-deploy/**
- helm-charts/**
- config/clusters/**
# Exclude the template configuration files
- "!config/clusters/templates/**"
- "!terraform/aws/projects/template.tfvars"
- "!terraform/gcp/projects/cluster.tfvars.template"
- "!eksctl/template.jsonnet"
# Queue triggered executions of this workflow stemming from pushes to avoid
# deployment conflicts.
#
# By using a different concurrency groups for pull requests and pushes, we
# reduce the risk of cancelling a queued but not started workflow as discussed
# in https://github.com/2i2c-org/infrastructure/issues/3214.
#
# github.head_ref is used to create PR unique concurrency groups, and for
# workflow executions not triggered by a PR we get a dedicated group.
#
# ref: https://docs.github.com/en/actions/using-jobs/using-concurrency
#
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || 'not-a-pr' }}
cancel-in-progress: false
# This environment variable triggers the deployer to colourise print statements in the
# GitHug Actions logs for easy reading
env:
TERM: xterm
# This may not be required any more, and it may depend on the kubectl version
# we use etc as well. For now, we have it added to avoid issues.
USE_GKE_GCLOUD_AUTH_PLUGIN: "True"
jobs:
# This job runs in Pull Requests and on pushes to the default branch. It identifies
# which files have been added or modified by recent GitHub activity and parsed a list
# to the `deployer generate helm-upgrade-job`s command of the deployer. This command generates
# two lists of dictionaries, which can be read by GitHub Actions as matrix jobs. The
# first set of jobs describes which clusters need their support chart and/or staging
# hub upgraded; and the second set of jobs describe which production hubs require
# upgrading. These two lists are set as job outputs via GITHUB_OUTPUT to be consumed
# by the later jobs. They are also pretty-printed in a human-readable format to the
# logs, and converted into Markdown tables for posting into GitHub comments.
generate-jobs:
runs-on: ubuntu-latest
outputs:
support-and-staging-matrix-jobs: ${{ steps.generate-jobs.outputs.support-and-staging-matrix-jobs }}
prod-hub-matrix-jobs: ${{ steps.generate-jobs.outputs.prod-hub-matrix-jobs }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.11"
# There will almost never be a cache hit on the cache key when this job is
# run, as it is the first of all jobs in this workflow. An exception is if
# this job is re-attempted as part of the same workflow run after
# succeeding previously.
- name: Save pip's install cache on job completion
uses: actions/cache@v4
with:
path: ~/.cache/pip
# key determines if we define or reuse an existing cache or not. Our
# key ensure we cache within a workflow run and its attempts, but not
# between workflow runs.
key: "${{ github.run_id }}"
- name: Install deployer script's Python dependencies
run: |
pip install --editable .
pip list
- name: Get merged/open PR labels
uses: actions/github-script@v7
id: pr-labels
with:
# Both pull_request and push can have triggered this job to run. A
# context with PR info (including its labels) is available when this
# job is triggered via pull_request, but not when triggered via push -
# a push can be triggered by other things than merged PRs. Due to
# this, we check if a pushed commit is matching some PRs merge commit,
# and if so gets its labels.
script: |
let labels = null;
if (context.eventName === 'pull_request') {
labels = context.payload.pull_request.labels;
}
else if (context.eventName === 'push') {
// api ref: https://octokit.github.io/rest.js/v20#pulls-list
const resp = await github.rest.pulls.list(
{
owner: context.repo.owner,
repo: context.repo.repo,
state: 'closed',
sort: 'updated',
direction: 'desc',
per_page: 100,
}
);
const merged_pr = resp.data.find(pr => pr.merge_commit_sha === context.sha);
labels = merged_pr?.labels;
}
label_names = (labels || []).map(l => l.name);
return label_names
- name: Identify files that have been added or modified
# Action repo: https://github.com/dorny/paths-filter
uses: dorny/paths-filter@v3
id: changed-files
with:
token: ""
list-files: csv
filters: |
changed:
- added|modified: deployer/**
- added|modified: requirements.txt
- added|modified: .github/actions/setup-deploy/**
- added|modified: helm-charts/basehub/**
- added|modified: helm-charts/daskhub/**
- added|modified: helm-charts/support/**
- added|modified: config/clusters/**
# This step will create a comment-body.txt file containing the jobs to be run in a
# Markdown table format to be posted on a Pull Request, if this job is triggered
# by one
- name: Generate matrix jobs
id: generate-jobs
run: |
deployer generate helm-upgrade-jobs "${{ steps.changed-files.outputs.changed_files }}" '${{ steps.pr-labels.outputs.result }}'
# The comment-deployment-plan-pr.yaml workflow won't have the correct context to
# know the PR number, so we save it to a file to pass to that workflow
- name: Save Pull Request number to a file
if: github.event_name == 'pull_request'
run: |
echo "${{ github.event.number }}" > pr-number.txt
# Upload the pr-number.txt and comment-body.txt files as artifacts for the
# comment-deployment-plan-pr.yaml workflow to access
- name: Upload artifacts
if: >
github.event_name == 'pull_request' &&
(steps.generate-jobs.outputs.support-and-staging-matrix-jobs != '[]' ||
steps.generate-jobs.outputs.prod-hub-matrix-jobs != '[]')
uses: actions/upload-artifact@v4
with:
name: pr
path: |
pr-number.txt
comment-body.txt
# https://github.com/ravsamhq/notify-slack-action
# Needs to be added per job
# When https://github.com/integrations/slack/issues/1563 gets implemented,
# we can use that instead
- name: Report Status
if: always()
uses: ravsamhq/notify-slack-action@v2
with:
# Warning: there are multiple "Report Status" steps in this file (one per job).
# Make sure they are all updated
notify_when: "failure"
status: ${{ job.status }} # required
# Message should look like: "Hey @author! Deploy and test hubs failed for "Add new hub - 9305e08".
message_format: '{emoji} Hey @${{ github.event.head_commit.author.name }}! *{workflow}* {status_message} for "${{ github.event.head_commit.message }} - <{commit_url}|{commit_sha}>". Checkout the run at {run_url}.'
footer: "<{run_url}|Failing Run>"
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GHA_FAILURES_WEBHOOK_URL }}
# This job upgrades the support chart, staging hub, and dask-staging hub (if present)
# for clusters in parallel, if those upgrades are required. This job needs the
# `generate-jobs` job to have completed and set an output to the
# `support-and-staging-matrix-jobs` variable name. It's inputs are a list of
# dictionaries with the keys cluster_name, provider, upgrade_support, and
# upgrade_staging for each cluster that requires it.
upgrade-support-and-staging:
runs-on: ubuntu-latest
needs: [generate-jobs]
# We declare outputs indicating the job failed status of a specific job
# variation. We are currently required to do this in a hardcoded fashion,
# see this post for feature requests for this to be improved:
# https://github.community/t/bug-jobs-output-should-return-a-list-for-a-matrix-job/128626/32?u=consideratio
#
# Warning: names can include alphanumerics, '-', and '_', but not '.', so
# we replace '.' for '-' in cluster names.
#
# If you are adding a new cluster, please remember to list it here!
outputs:
failure_2i2c-aws-us: "${{ steps.declare-failure.outputs.failure_2i2c-aws-us }}"
failure_2i2c-uk: "${{ steps.declare-failure.outputs.failure_2i2c-uk }}"
failure_2i2c: "${{ steps.declare-failure.outputs.failure_2i2c }}"
failure_awi-ciroh: "${{ steps.declare-failure.outputs.failure_awi-ciroh }}"
failure_catalystproject-africa: "${{ steps.declare-failure.outputs.failure_catalystproject-africa }}"
failure_catalystproject-latam: "${{ steps.declare-failure.outputs.failure_catalystproject-latam }}"
failure_cloudbank: "${{ steps.declare-failure.outputs.failure_cloudbank }}"
failure_dubois: "${{ steps.declare-failure.outputs.failure_dubois }}"
failure_earthscope: "${{ steps.declare-failure.outputs.failure_earthscope }}"
failure_gridsst: "${{ steps.declare-failure.outputs.failure_gridsst }}"
failure_hhmi: "${{ steps.declare-failure.outputs.failure_hhmi }}"
failure_jupyter-health: "${{ steps.declare-failure.outputs.failure_jupyter-health }}"
failure_jupyter-meets-the-earth: "${{ steps.declare-failure.outputs.failure_jupyter-meets-the-earth }}"
failure_kitware: "${{ steps.declare-failure.outputs.failure_kitware }}"
failure_leap: "${{ steps.declare-failure.outputs.failure_leap }}"
failure_maap: "${{ steps.declare-failure.outputs.failure_maap }}"
failure_nasa-cryo: "${{ steps.declare-failure.outputs.failure_nasa-cryo }}"
failure_nasa-ghg: "${{ steps.declare-failure.outputs.failure_nasa-ghg }}"
failure_nasa-veda: "${{ steps.declare-failure.outputs.failure_nasa-veda }}"
failure_nmfs-openscapes: "${{ steps.declare-failure.outputs.failure_nmfs-openscapes }}"
failure_openscapes: "${{ steps.declare-failure.outputs.failure_openscapes }}"
failure_opensci: "${{ steps.declare-failure.outputs.failure_opensci }}"
failure_pangeo-hubs: "${{ steps.declare-failure.outputs.failure_pangeo-hubs }}"
failure_projectpythia: "${{ steps.declare-failure.outputs.failure_projectpythia }}"
failure_queensu: "${{ steps.declare-failure.outputs.failure_queensu }}"
failure_smithsonian: "${{ steps.declare-failure.outputs.failure_smithsonian }}"
failure_strudel: "${{ steps.declare-failure.outputs.failure_strudel }}"
failure_ubc-eoas: "${{ steps.declare-failure.outputs.failure_ubc-eoas }}"
failure_utoronto: "${{ steps.declare-failure.outputs.failure_utoronto }}"
failure_victor: "${{ steps.declare-failure.outputs.failure_victor }}"
if: |
(github.event_name == 'push' && contains(github.ref, 'main')) &&
needs.generate-jobs.result == 'success' &&
needs.generate-jobs.outputs.support-and-staging-matrix-jobs != '[]'
strategy:
# Don't stop other deployments if one fails
fail-fast: false
matrix:
jobs: ${{ fromJson(needs.generate-jobs.outputs.support-and-staging-matrix-jobs) }}
steps:
- uses: actions/checkout@v4
- name: Setup deploy for ${{ matrix.jobs.cluster_name }}
uses: ./.github/actions/setup-deploy
with:
provider: ${{ matrix.jobs.provider }}
GCP_KMS_DECRYPTOR_KEY: ${{ secrets.GCP_KMS_DECRYPTOR_KEY }}
- name: Upgrade support chart on cluster ${{ matrix.jobs.cluster_name }}
if: matrix.jobs.upgrade_support
run: |
deployer deploy-support ${{ matrix.jobs.cluster_name }}
- name: Upgrade staging hub on cluster ${{ matrix.jobs.cluster_name }}
if: matrix.jobs.upgrade_staging
run: |
deployer deploy ${{ matrix.jobs.cluster_name }} staging
# Retry action: https://github.com/marketplace/actions/retry-step
- name: Run health check for staging hub on cluster ${{ matrix.jobs.cluster_name }}
if: matrix.jobs.upgrade_staging
uses: nick-fields/retry@v3
with:
timeout_minutes: 10
max_attempts: 2
command: |
deployer run-hub-health-check ${{ matrix.jobs.cluster_name }} staging
- name: Upgrade dask-staging hub on cluster ${{ matrix.jobs.cluster_name }} if it exists
if: matrix.jobs.upgrade_staging && matrix.jobs.cluster_name == '2i2c'
run: |
deployer deploy ${{ matrix.jobs.cluster_name }} dask-staging
# Retry action: https://github.com/marketplace/actions/retry-step
- name: Run health check for dask-staging hub on cluster ${{ matrix.jobs.cluster_name }} if it exists
if: matrix.jobs.upgrade_staging && matrix.jobs.cluster_name == '2i2c'
uses: nick-fields/retry@v3
with:
timeout_minutes: 10
max_attempts: 2
command: |
deployer run-hub-health-check ${{ matrix.jobs.cluster_name }} dask-staging
- name: Declare failure status
id: declare-failure
if: always()
shell: python
run: |
import os
name = "${{ matrix.jobs.cluster_name }}".replace(".", "-")
failure = "${{ job.status == 'failure' }}"
output_file = os.getenv("GITHUB_OUTPUT")
with open(output_file, "a") as f:
f.write(f"failure_{name}={failure}")
# https://github.com/ravsamhq/notify-slack-action
# Needs to be added per job
# FIXME: when https://github.com/integrations/slack/issues/1563 gets implemented,
# we can use that instead
- name: Report Status
if: always()
uses: ravsamhq/notify-slack-action@v2
# Warning: there are multiple "Report Status" steps in this file (one per job).
# Make sure they are all updated
with:
notify_when: "failure"
status: ${{ job.status }} # required
# Message should look like: "Hey @author! Deploy and test hubs failed for "Add new hub - 9305e08".
message_format: '{emoji} Hey @${{ github.event.head_commit.author.name }}! *{workflow}* {status_message} for "${{ github.event.head_commit.message }} - <{commit_url}|{commit_sha}>". Checkout the run at {run_url}.'
footer: "<{run_url}|Failing Run>"
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GHA_FAILURES_WEBHOOK_URL }}
# This jobs reduces the initially planned prod-hub-matrix-jobs deployments by
# filtering out any deployment to a cluster with a failed support-and-staging
# job.
filter-generate-jobs:
runs-on: ubuntu-latest
needs: [generate-jobs, upgrade-support-and-staging]
if: |
!cancelled() &&
(github.event_name == 'push' && contains(github.ref, 'main')) &&
needs.generate-jobs.result == 'success' &&
needs.generate-jobs.outputs.prod-hub-matrix-jobs != '[]'
outputs:
prod-hub-matrix-jobs: ${{ steps.filter-jobs.outputs.prod-hub-matrix-jobs }}
steps:
# This Python script filters out any prod hub deployment job from running
# later based on if its part of a cluster where support/staging upgrade
# just failed. Data is injected to the script before its executed via
# string literals as rendered GitHub workflow expressions.
- name: Filter prod deploy jobs to run based on failures in support/staging
id: filter-jobs
shell: python
run: |
import os
import json
jobs = json.loads(r"""${{ needs.generate-jobs.outputs.prod-hub-matrix-jobs }}""")
outputs = json.loads(r"""${{ toJson(needs.upgrade-support-and-staging.outputs) }}""")
try:
filtered_jobs = [
job
for job in jobs
if outputs[f"failure_{job['cluster_name'].replace('.', '-')}"] != "true"
]
except KeyError:
print(f"The {cluster_name} cluster wasn't found in the `upgrade-support-and-staging.outputs` list. Please add it before continuing!")
output_file = os.getenv("GITHUB_OUTPUT")
with open(output_file, "a") as f:
f.write(f"prod-hub-matrix-jobs={json.dumps(filtered_jobs)}")
# https://github.com/ravsamhq/notify-slack-action
# Needs to be added per job
# When https://github.com/integrations/slack/issues/1563 gets implemented,
# we can use that instead
- name: Report Status
if: always()
uses: ravsamhq/notify-slack-action@v2
with:
# Warning: there are multiple "Report Status" steps in this file (one per job).
# Make sure they are all updated
notify_when: "failure"
status: ${{ job.status }} # required
# Message should look like: "Hey @author! Deploy and test hubs failed for "Add new hub - 9305e08".
message_format: '{emoji} Hey @${{ github.event.head_commit.author.name }}! *{workflow}* {status_message} for "${{ github.event.head_commit.message }} - <{commit_url}|{commit_sha}>". Checkout the run at {run_url}.'
footer: "<{run_url}|Failing Run>"
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GHA_FAILURES_WEBHOOK_URL }}
# This job upgrades production hubs on clusters in parallel, if required. This
# job needs both the `filter-generate-jobs` to have completed to provide its
# output `filtered-prod-hub-matrix-jobs`. It is a list of dictionaries with
# the keys cluster_name, provider, and hub_name for each production hub that
# requires an upgrade and didn't have a failed support-and-staging-upgrade job
# run as part of this workflow.
upgrade-prod-hubs:
runs-on: ubuntu-latest
needs: [filter-generate-jobs]
if: |
!cancelled() &&
(github.event_name == 'push' && contains(github.ref, 'main')) &&
needs.filter-generate-jobs.result == 'success' &&
needs.filter-generate-jobs.outputs.prod-hub-matrix-jobs != '[]'
strategy:
# Don't stop other deployments if one fails
fail-fast: false
matrix:
jobs: ${{ fromJson(needs.filter-generate-jobs.outputs.prod-hub-matrix-jobs) }}
steps:
- uses: actions/checkout@v4
- name: Setup deploy for ${{ matrix.jobs.cluster_name }} cluster
uses: ./.github/actions/setup-deploy
with:
provider: ${{ matrix.jobs.provider }}
GCP_KMS_DECRYPTOR_KEY: ${{ secrets.GCP_KMS_DECRYPTOR_KEY }}
- name: Upgrade ${{ matrix.jobs.hub_name }} hub on cluster ${{ matrix.jobs.cluster_name }}
run: |
deployer deploy ${{ matrix.jobs.cluster_name }} ${{ matrix.jobs.hub_name }}
# Retry action: https://github.com/marketplace/actions/retry-step
- name: Run health check against ${{ matrix.jobs.hub_name }} hub on cluster ${{ matrix.jobs.cluster_name}}
uses: nick-fields/retry@v3
with:
timeout_minutes: 10
max_attempts: 3
command: |
deployer run-hub-health-check ${{ matrix.jobs.cluster_name }} ${{ matrix.jobs.hub_name }}
# https://github.com/ravsamhq/notify-slack-action
# Needs to be added per job
# When https://github.com/integrations/slack/issues/1563 gets implemented,
# we can use that instead
- name: Report Status
if: always()
uses: ravsamhq/notify-slack-action@v2
with:
# Warning: there are multiple "Report Status" steps in this file (one per job).
# Make sure they are all updated
notify_when: "failure"
status: ${{ job.status }} # required
# Message should look like: "Hey @author! Deploy and test hubs failed for "Add new hub - 9305e08".
message_format: '{emoji} Hey @${{ github.event.head_commit.author.name }}! *{workflow}* {status_message} for "${{ github.event.head_commit.message }} - <{commit_url}|{commit_sha}>". Checkout the run at {run_url}.'
footer: "<{run_url}|Failing Run>"
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GHA_FAILURES_WEBHOOK_URL }}