From e08595f1515fbddb2788e19b81471faf7230f4e4 Mon Sep 17 00:00:00 2001
From: Jeremy Lewi
Date: Tue, 26 Mar 2019 19:28:47 -0700
Subject: [PATCH] Add a playbook for cleaning up the test infrastructure. (#338)

* Add a playbook describing how to deal with the CI infrastructure
  running out of GCP quota.

* The cron/batch job for the CI system should not be pinned to check out
  the code at PR 300; we should be using master.

* We are seeing socket errors contacting the Deployment Manager (DM)
  service, so add some retries and, in the event of permanent failure,
  try to keep going.

Related to: #337
---
 playbook.md                                 | 52 +++++++++++++++++++
 py/kubeflow/testing/cleanup_ci.py           | 24 ++++++++-
 .../ks_app/components/cleanup-ci.libsonnet  |  8 ---
 3 files changed, 74 insertions(+), 10 deletions(-)
 create mode 100644 playbook.md

diff --git a/playbook.md b/playbook.md
new file mode 100644
index 00000000000..9b2d718c373
--- /dev/null
+++ b/playbook.md
@@ -0,0 +1,52 @@
+# Kubeflow Test Infrastructure Playbook
+
+This is a playbook for build cops to help deal with problems with the CI infrastructure.
+
+
+## GCP Quota errors
+
+1. List regional quotas to see which quotas are running hot
+
+   ```
+   gcloud compute regions describe --project=kubeflow-ci ${REGION}
+   ```
+
+1. Check whether we are leaking Kubeflow deployments and this is causing us to run out of quota.
+
+   ```
+   gcloud --project=kubeflow-ci --format="table(name,createTime:sort=1,location,status)" container clusters list
+   gcloud --project=kubeflow-ci deployment-manager deployments list --format="table(name,insertTime:sort=1)"
+   ```
+
+   * Deployments created by the E2E tests should be GC'd after O(2) hours.
+   * So if there are resources older than O(2) hours, it indicates that there is a problem with
+     garbage collection.
+
+1. Check whether the cron job to GC resources is running in the test cluster.
+
+   ```
+   kubectl get cronjobs
+   NAME SCHEDULE SUSPEND ACTIVE LAST SCHEDULE AGE
+   cleanup-ci 0 */2 * * * False 0 14m
+   ```
+
+   * The cron job is defined in [cleanup-ci-cron.jsonnet](https://github.com/kubeflow/testing/blob/master/test-infra/ks_app/components/cleanup-ci-cron.jsonnet).
+
+   * If the cron job is not configured then start it (see the sketch below).
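+
+     A minimal sketch of doing that with the ksonnet CLI, assuming `ks` is
+     installed, this repo is checked out, and the app's environment is named
+     `default` (run `ks env list` in `test-infra/ks_app` to see the real
+     environment names):
+
+     ```
+     cd test-infra/ks_app
+     ks apply default -c cleanup-ci-cron
+     ```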
+
+
+1. Look for recent runs of the cron job and figure out whether they are running successfully.
+
+   ```
+   kubectl get jobs | grep cleanup-ci
+   ```
+
+   * Jobs triggered by cron will match the pattern `cleanup-ci-??????????`.
+
+   * Check that the job ran successfully.
+
+   * The pods associated with the job can be fetched via labels
+
+     ```
+     kubectl logs -l job-name=${JOBNAME}
+     ```
\ No newline at end of file
diff --git a/py/kubeflow/testing/cleanup_ci.py b/py/kubeflow/testing/cleanup_ci.py
index 87821f7edd1..dd68f544840 100644
--- a/py/kubeflow/testing/cleanup_ci.py
+++ b/py/kubeflow/testing/cleanup_ci.py
@@ -5,6 +5,8 @@
 import logging
 import os
 import re
+import retrying
+import socket
 import subprocess
 import tempfile
 import yaml
@@ -43,6 +45,11 @@ def is_match(name, patterns=None):
 
   return False
 
+def is_retryable_exception(exception):
+  """Return True if we consider the exception retryable."""
+  # Socket errors look like temporary problems connecting to GCP.
+  return isinstance(exception, socket.error)
+
 def cleanup_workflows(args):
   # We need to load the kube config so that we can have credentials to
   # talk to the APIServer.
@@ -347,6 +354,11 @@ def getAge(tsInRFC3339):
   age = datetime.datetime.utcnow()- insert_time_utc
   return age
 
+@retrying.retry(stop_max_attempt_number=5,
+                retry_on_exception=is_retryable_exception)
+def execute_rpc(rpc):
+  """Execute a Google RPC request with retries."""
+  return rpc.execute()
 
 def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-branches
   if not args.delete_script:
@@ -382,8 +394,16 @@ def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-br
     else:
       manifest_url = d["manifest"]
       manifest_name = manifest_url.split("/")[-1]
-      manifest = manifests_client.get(
-        project=args.project, deployment=name, manifest=manifest_name).execute()
+
+      rpc = manifests_client.get(project=args.project,
+                                 deployment=name,
+                                 manifest=manifest_name)
+      try:
+        manifest = execute_rpc(rpc)
+      except socket.error as e:
+        logging.error("socket error prevented getting manifest %s", e)
+        # Try to continue with deletion rather than aborting.
+        continue
 
       # Create a temporary directory to store the deployment.
       manifest_dir = tempfile.mkdtemp(prefix="tmp" + name)
diff --git a/test-infra/ks_app/components/cleanup-ci.libsonnet b/test-infra/ks_app/components/cleanup-ci.libsonnet
index 3cc51998980..1461dda9e41 100644
--- a/test-infra/ks_app/components/cleanup-ci.libsonnet
+++ b/test-infra/ks_app/components/cleanup-ci.libsonnet
@@ -44,14 +44,6 @@
         name: "REPO_NAME",
         value: "testing",
       },
-      {
-        // TODO(jlewi): Stop setting PULL_NUMBER once the PR is merged.
-        // We had to set the PR number because when we initially created the
-        // job we had some changes to cleanup_ci.py that were part of the PR
-        // committing the job.
-        name: "PULL_NUMBER",
-        value: "300",
-      },
       {
         name: "PYTHONPATH",
         value: "/src/kubeflow/testing/py",
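
A minimal sketch (illustration only, not part of the patch) of the
retry-then-continue pattern the cleanup_ci.py change introduces. It assumes
the `retrying` package is installed (`pip install retrying`); `pending_rpcs`
and `rpc` are hypothetical stand-ins for the Deployment Manager requests:

```python
import logging
import socket

import retrying

def is_retryable_exception(exception):
  # Socket errors look like transient problems reaching GCP, so retry them.
  return isinstance(exception, socket.error)

@retrying.retry(stop_max_attempt_number=5,
                retry_on_exception=is_retryable_exception)
def execute_rpc(rpc):
  """Execute a Google API request, retrying transient socket errors."""
  return rpc.execute()

def process_all(pending_rpcs):
  for rpc in pending_rpcs:
    try:
      result = execute_rpc(rpc)
    except socket.error as e:
      # Once the retries are exhausted, retrying re-raises the original
      # exception; log it and keep going instead of aborting the whole run.
      logging.error("permanent socket error; skipping this resource: %s", e)
      continue
    logging.info("fetched: %s", result)
```

Because `retry_on_exception` only matches socket errors, any other exception
is re-raised immediately, so genuine bugs still fail fast.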