From e08595f1515fbddb2788e19b81471faf7230f4e4 Mon Sep 17 00:00:00 2001
From: Jeremy Lewi
Date: Tue, 26 Mar 2019 19:28:47 -0700
Subject: [PATCH] Add a playbook for cleaning up the test infrastructure. (#338)

* Add a playbook describing how to deal with the CI infrastructure
  running out of GCP quota.

* The cron/batch job for the CI system should not be pinned to check out
  the code at PR 300; we should be using master.

* We are seeing socket errors contacting the Deployment Manager (DM)
  service, so add some retries and, in the event of permanent failure,
  try to keep going.

Related to: #337
---
 playbook.md                                 | 52 +++++++++++++++++++
 py/kubeflow/testing/cleanup_ci.py           | 24 ++++++++-
 .../ks_app/components/cleanup-ci.libsonnet  |  8 ---
 3 files changed, 74 insertions(+), 10 deletions(-)
 create mode 100644 playbook.md

diff --git a/playbook.md b/playbook.md
new file mode 100644
index 00000000000..9b2d718c373
--- /dev/null
+++ b/playbook.md
@@ -0,0 +1,52 @@
+# Kubeflow Test Infrastructure Playbook
+
+This is a playbook for build cops to help deal with problems with the CI infrastructure.
+
+
+## GCP Quota errors
+
+1. List regional quotas to see which quotas are running hot
+
+   ```
+   gcloud compute regions describe --project=kubeflow-ci ${REGION}
+   ```
+
+1. Check whether we are leaking Kubeflow deployments and this is causing us to run out of quota.
+
+   ```
+   gcloud --project=kubeflow-ci --format="table(name,createTime:sort=1,location,status)" container clusters list
+   gcloud --project=kubeflow-ci deployment-manager deployments list --format="table(name,insertTime:sort=1)"
+   ```
+
+   * Deployments created by the E2E tests should be GC'd after O(2) hours.
+   * So if there are resources older than O(2) hours, it indicates that there is a problem with
+     garbage collection.
+
+1. Check whether the cron job to GC resources is running in the test cluster.
+
+   ```
+   kubectl get cronjobs
+   NAME SCHEDULE SUSPEND ACTIVE LAST SCHEDULE AGE
+   cleanup-ci 0 */2 * * * False 0 14m
+   ```
+
+   * The cron job is defined in [cleanup-ci-cron.jsonnet](https://github.com/kubeflow/testing/blob/master/test-infra/ks_app/components/cleanup-ci-cron.jsonnet).
+
+   * If the cron job is not configured then start it (see the sketch below).
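+
+     A minimal sketch of doing that with the ksonnet CLI, assuming `ks` is
+     installed, this repo is checked out, and the app's environment is named
+     `default` (run `ks env list` in `test-infra/ks_app` to see the real
+     environment names):
+
+     ```
+     cd test-infra/ks_app
+     ks apply default -c cleanup-ci-cron
+     ```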
+
+
+1. Look for recent runs of the cron job and figure out whether they are running successfully.
+
+   ```
+   kubectl get jobs | grep cleanup-ci
+   ```
+
+   * Jobs triggered by cron will match the pattern `cleanup-ci-??????????`.
+
+   * Check that the job ran successfully.
+
+   * The pods associated with the job can be fetched via labels
+
+     ```
+     kubectl logs -l job-name=${JOBNAME}
+     ```
\ No newline at end of file
diff --git a/py/kubeflow/testing/cleanup_ci.py b/py/kubeflow/testing/cleanup_ci.py
index 87821f7edd1..dd68f544840 100644
--- a/py/kubeflow/testing/cleanup_ci.py
+++ b/py/kubeflow/testing/cleanup_ci.py
@@ -5,6 +5,8 @@
 import logging
 import os
 import re
+import retrying
+import socket
 import subprocess
 import tempfile
 import yaml
@@ -43,6 +45,11 @@ def is_match(name, patterns=None):
 
   return False
 
+def is_retryable_exception(exception):
+  """Return True if we consider the exception retryable."""
+  # Socket errors look like temporary problems connecting to GCP.
+  return isinstance(exception, socket.error)
+
 def cleanup_workflows(args):
   # We need to load the kube config so that we can have credentials to
   # talk to the APIServer.
@@ -347,6 +354,11 @@ def getAge(tsInRFC3339):
   age = datetime.datetime.utcnow()- insert_time_utc
   return age
 
+@retrying.retry(stop_max_attempt_number=5,
+                retry_on_exception=is_retryable_exception)
+def execute_rpc(rpc):
+  """Execute a Google RPC request with retries."""
+  return rpc.execute()
 
 def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-branches
   if not args.delete_script:
@@ -382,8 +394,16 @@ def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-br
     else:
       manifest_url = d["manifest"]
       manifest_name = manifest_url.split("/")[-1]
-      manifest = manifests_client.get(
-        project=args.project, deployment=name, manifest=manifest_name).execute()
+
+      rpc = manifests_client.get(project=args.project,
+                                 deployment=name,
+                                 manifest=manifest_name)
+      try:
+        manifest = execute_rpc(rpc)
+      except socket.error as e:
+        logging.error("socket error prevented getting manifest %s", e)
+        # Try to continue with deletion rather than aborting.
+        continue
 
       # Create a temporary directory to store the deployment.
       manifest_dir = tempfile.mkdtemp(prefix="tmp" + name)
diff --git a/test-infra/ks_app/components/cleanup-ci.libsonnet b/test-infra/ks_app/components/cleanup-ci.libsonnet
index 3cc51998980..1461dda9e41 100644
--- a/test-infra/ks_app/components/cleanup-ci.libsonnet
+++ b/test-infra/ks_app/components/cleanup-ci.libsonnet
@@ -44,14 +44,6 @@
         name: "REPO_NAME",
         value: "testing",
       },
-      {
-        // TODO(jlewi): Stop setting PULL_NUMBER once the PR is merged.
-        // We had to set the PR number because when we initially created the
-        // job we had some changes to cleanup_ci.py that were part of the PR
-        // committing the job.
-        name: "PULL_NUMBER",
-        value: "300",
-      },
       {
         name: "PYTHONPATH",
         value: "/src/kubeflow/testing/py",
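
A minimal sketch (illustration only, not part of the patch) of the
retry-then-continue pattern the cleanup_ci.py change introduces. It assumes
the `retrying` package is installed (`pip install retrying`); `pending_rpcs`
and `rpc` are hypothetical stand-ins for the Deployment Manager requests:

```python
import logging
import socket

import retrying

def is_retryable_exception(exception):
  # Socket errors look like transient problems reaching GCP, so retry them.
  return isinstance(exception, socket.error)

@retrying.retry(stop_max_attempt_number=5,
                retry_on_exception=is_retryable_exception)
def execute_rpc(rpc):
  """Execute a Google API request, retrying transient socket errors."""
  return rpc.execute()

def process_all(pending_rpcs):
  for rpc in pending_rpcs:
    try:
      result = execute_rpc(rpc)
    except socket.error as e:
      # Once the retries are exhausted, retrying re-raises the original
      # exception; log it and keep going instead of aborting the whole run.
      logging.error("permanent socket error; skipping this resource: %s", e)
      continue
    logging.info("fetched: %s", result)
```

Because `retry_on_exception` only matches socket errors, any other exception
is re-raised immediately, so genuine bugs still fail fast.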