Skip to content

Commit

Permalink
Create a cron job to regularly garbage collect test resources. (kubef…
Browse files Browse the repository at this point in the history
…low#300)

* Create a cron job to regularly garbage collect test resources.

* Add to cleanup_ci.py an "all" subcommand to delete all resources.
* Add a batch job for one off runs.

Related to:
  kubeflow#87 Cron job to garbage collect test resources
  kubeflow#249 cron job to collect Kubeflow deployments launched by E2E tests

* Add a cron job to run the cleanup every two hours.
* In cleanup_ci.py, don't load the imports of the manifests.
  We encountered an error where the manifest didn't exist. I think
  that may have been a collision because we had a separate script running to do
  the deletes.

* Fix some bugs.

* Deal with config being none.

* Maybe activate service account.
  • Loading branch information
jlewi authored and k8s-ci-robot committed Feb 5, 2019
1 parent da8a4dc commit 83f488d
Show file tree
Hide file tree
Showing 5 changed files with 196 additions and 21 deletions.
62 changes: 41 additions & 21 deletions py/kubeflow/testing/cleanup_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,11 +232,11 @@ def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-br
with open(os.path.join(manifest_dir, "cluster-kubeflow.yaml"), "w") as hf:
hf.write(manifest["config"]["content"])

for i in manifest["imports"]:
with open(os.path.join(manifest_dir, i["name"]), "w") as hf:
hf.write(i["content"])

config = yaml.load(manifest["config"]["content"])

if not config:
logging.warning("Skipping deployment %s because it has no config; "
"is it already being deleted?", name)
zone = config["resources"][0]["properties"]["zone"]
command = [args.delete_script,
"--project=" + args.project, "--deployment=" + name,
Expand Down Expand Up @@ -293,6 +293,30 @@ def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-br
clusters_client.delete(projectId=args.project, zone=zone,
clusterId=name).execute()

def cleanup_all(args):
  """Run every garbage collection routine against the test project.

  Invokes the deployment, endpoint, service-account, and Argo workflow
  cleanups in order, forwarding the parsed command line arguments.

  Args:
    args: Parsed argparse namespace shared by all cleanup subcommands.
  """
  steps = [
      cleanup_deployments,
      cleanup_endpoints,
      cleanup_service_accounts,
      cleanup_workflows,
  ]
  for step in steps:
    step(args)

def add_workflow_args(parser):
  """Register the flags used by subcommands that clean up Argo workflows.

  Args:
    parser: argparse.ArgumentParser (or subparser) to attach flags to.
  """
  parser.add_argument(
      "--namespace",
      default="kubeflow-test-infra",
      help="Namespace to cleanup.")

def add_deployments_args(parser):
  """Register the flags used by subcommands that clean up GCP deployments.

  Args:
    parser: argparse.ArgumentParser (or subparser) to attach flags to.
  """
  def parse_bool(value):
    # Fix: the original used type=bool, but argparse passes the raw string
    # to the type callable and bool("False") is True — so any non-empty
    # value (including "False") enabled the flag. Parse common spellings
    # explicitly instead; everything else is treated as False.
    return str(value).strip().lower() in ("1", "true", "yes", "y")

  parser.add_argument(
      "--update_first", default=False, type=parse_bool,
      help="Whether to update the deployment first.")

  parser.add_argument(
      "--delete_script", default="", type=str,
      help=("The path to the delete_deployment.sh script which is in the "
            "Kubeflow repository."))
  parser.add_argument(
      "--zones", default="us-east1-d,us-central1-a", type=str,
      help="Comma separated list of zones to check.")

def main():
logging.basicConfig(level=logging.INFO,
format=('%(levelname)s|%(asctime)s'
Expand All @@ -310,15 +334,22 @@ def main():

subparsers = parser.add_subparsers()

######################################################
# Parser for everything
parser_all = subparsers.add_parser(
"all", help="Cleanup everything")

add_deployments_args(parser_all)
add_workflow_args(parser_all)

parser_all.set_defaults(func=cleanup_all)

######################################################
# Parser for argo_workflows
parser_argo = subparsers.add_parser(
"workflows", help="Cleanup workflows")

parser_argo.add_argument(
"--namespace", default="kubeflow-test-infra",
help="Namespace to cleanup.")

add_workflow_args(parser_argo)
parser_argo.set_defaults(func=cleanup_workflows)

######################################################
Expand All @@ -340,22 +371,11 @@ def main():
parser_deployments = subparsers.add_parser(
"deployments", help="Cleanup deployments")

parser_deployments.add_argument(
"--update_first", default=False, type=bool,
help="Whether to update the deployment first.")

parser_deployments.add_argument(
"--delete_script", default="", type=str,
help=("The path to the delete_deployment.sh script which is in the "
"Kubeflow repository."))

parser_deployments.add_argument(
"--zones", default="us-east1-d,us-central1-a", type=str,
help="Comma separated list of zones to check.")

add_deployments_args(parser_deployments)
parser_deployments.set_defaults(func=cleanup_deployments)
args = parser.parse_args()

util.maybe_activate_service_account()
args.func(args)

if __name__ == "__main__":
Expand Down
36 changes: 36 additions & 0 deletions test-infra/ks_app/components/cleanup-ci-cron.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// CronJob that periodically garbage collects CI test resources.
// The pod spec is shared with the one-off job via cleanup-ci.libsonnet.
local params = std.extVar("__ksonnet/params").components["cleanup-ci-cron"];
local env = std.extVar("__ksonnet/environments");

local k = import "k.libsonnet";
local cleanup = import "cleanup-ci.libsonnet";

local job = {
  // NOTE(review): batch/v1beta1 CronJob was current when written; newer
  // clusters require batch/v1 — confirm against the target cluster version.
  "apiVersion": "batch/v1beta1",
  "kind": "CronJob",
  "metadata": {
    name: params.name,
    namespace: env.namespace,
    labels: {
      app: "cleanup-ci"
    },
  },
  spec: {
    // Run the cleanup every two hours, on the hour.
    schedule: "0 */2 * * *" ,
    // Don't start a new run while a previous cleanup is still in flight.
    concurrencyPolicy: "Forbid",
    jobTemplate: {
      metadata: {
        labels: {
          app: "cleanup-ci",
        },
      },
      // Shared with the one-off cleanup-ci job (see cleanup-ci.libsonnet).
      spec: cleanup.jobSpec,
    },
  },
};

std.prune(k.core.v1.list.new([
  job,
]))
// One-off Job to manually trigger a cleanup of the CI system.
local params = std.extVar("__ksonnet/params").components["cleanup-ci"];
local env = std.extVar("__ksonnet/environments");

local k = import "k.libsonnet";
local cleanup = import "cleanup-ci.libsonnet";

// A batch/v1 Job wrapping the pod spec shared with the cron component.
local cleanupJob = {
  apiVersion: "batch/v1",
  kind: "Job",
  metadata: {
    name: params.name,
    namespace: env.namespace,
    labels: { app: "cleanup-ci" },
  },
  spec: cleanup.jobSpec,
};

std.prune(k.core.v1.list.new([cleanupJob]))
{

  // Build a multi-line container command.
  // Input is a list of lists, where each inner list is one command, e.g.
  //   [["echo", "command-one"], ["echo", "command-two"]]
  // Output is a single shell invocation that runs them in sequence, e.g.
  //   ["/bin/sh", "-xc", "echo command-one; echo command-two"]
  buildCommand:: function(items)
    ["/bin/sh", "-xc"] +
    [std.join("; ",
      std.map(
        function(c) std.join(" ", c),
        items,
      )
    )],

  // Pod spec shared by the one-off Job (cleanup-ci.jsonnet) and the
  // CronJob (cleanup-ci-cron.jsonnet). Checks out the repos, then runs
  // the "all" subcommand of kubeflow.testing.cleanup_ci.
  jobSpec:: {
    template: {
      spec: {
        containers: [
          {
            command: $.buildCommand([[
              "/usr/local/bin/checkout.sh",
              "/src",
            ],
            [
              "python",
              "-m",
              "kubeflow.testing.cleanup_ci",
              "all",
              "--delete_script=/src/kubeflow/kubeflow/scripts/gke/delete_deployment.sh",
            ],
            ]),
            image: "gcr.io/kubeflow-ci/test-worker/test-worker:v20190116-b7abb8d-e3b0c4",
            // Fixed: this was named "label-sync", a copy/paste leftover from
            // another job; name the container after the app it actually runs.
            name: "cleanup-ci",
            env: [
              {
                name: "REPO_OWNER",
                value: "kubeflow",
              },
              {
                name: "REPO_NAME",
                value: "testing",
              },
              {
                // TODO(jlewi): Stop setting PULL_NUMBER once the PR is merged.
                // We had to set the PR number because when we initially created
                // the job we had some changes to cleanup_ci.py that were part
                // of the PR committing the job.
                name: "PULL_NUMBER",
                value: "300",
              },
              {
                name: "PYTHONPATH",
                value: "/src/kubeflow/testing/py",
              },
              {
                // checkout.sh also clones kubeflow/kubeflow so the
                // delete_deployment.sh script is available.
                name: "EXTRA_REPOS",
                value: "kubeflow/kubeflow@HEAD",
              },
              {
                name: "GOOGLE_APPLICATION_CREDENTIALS",
                value: "/secret/gcp-credentials/key.json",
              },
            ],
            volumeMounts: [
              {
                name: "gcp-credentials",
                mountPath: "/secret/gcp-credentials",
                readOnly: true
              },
            ]
          }
        ],
        // Cleanup is retried by the next cron run; don't restart failed pods.
        restartPolicy: "Never",
        volumes: [
          {
            name: "gcp-credentials",
            secret: {
              secretName: "kubeflow-testing-credentials",
            },
          },
        ]
      }
    }
  },
}
6 changes: 6 additions & 0 deletions test-infra/ks_app/components/params.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
version: "v2.2.1",
exposeUi: false,
},
"cleanup-ci": {
name: "cleanup-ci",
},
"cleanup-ci-cron": {
name: "cleanup-ci",
},
"nfs-external": {
name: "nfs-external",
namespace: "kubeflow-test-infra",
Expand Down

0 comments on commit 83f488d

Please sign in to comment.