From e2a1e8750cf1d7e93f7215d6207ce668417e2116 Mon Sep 17 00:00:00 2001
From: Thorarinn Sigurdsson <thorarinnsigurdsson@gmail.com>
Date: Thu, 7 Mar 2019 14:12:30 +0100
Subject: [PATCH] fix: increase init delay for liveness probe

The initial delay in the liveness probe config for k8s health checks was
too short. This was due to the mistaken assumption that k8s would only
start running the liveness probe after the readiness probe succeeded.

For services that take a long time to start up, the old config would
sometimes lead to the liveness probe failing and thus the container
being restarted - repeatedly, in some situations.

The initial delay and period of the readiness probe were also reduced
(so it starts sooner and runs more often), and its failure threshold was
raised to compensate.

Later, we should aim to make initialDelaySeconds configurable for the
liveness probe.
---
 .../plugins/kubernetes/container/deployment.ts    | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/garden-service/src/plugins/kubernetes/container/deployment.ts b/garden-service/src/plugins/kubernetes/container/deployment.ts
index 8a55e235a0..10a261778c 100644
--- a/garden-service/src/plugins/kubernetes/container/deployment.ts
+++ b/garden-service/src/plugins/kubernetes/container/deployment.ts
@@ -269,16 +269,23 @@ function deploymentConfig(service: Service, configuredReplicas: number, namespac
 
 function configureHealthCheck(container, spec): void {
 
+  const readinessPeriodSeconds = 1
+  const readinessFailureThreshold = 90
+
   container.readinessProbe = {
-    initialDelaySeconds: 10,
-    periodSeconds: 5,
+    initialDelaySeconds: 2,
+    periodSeconds: readinessPeriodSeconds,
     timeoutSeconds: 3,
     successThreshold: 2,
-    failureThreshold: 5,
+    failureThreshold: readinessFailureThreshold,
   }
 
+  /*
+   * We wait for the effective failure duration (period * threshold) of the readiness probe before starting the
+   * liveness probe.
+   */
   container.livenessProbe = {
-    initialDelaySeconds: 15,
+    initialDelaySeconds: readinessPeriodSeconds * readinessFailureThreshold,
     periodSeconds: 5,
     timeoutSeconds: 3,
     successThreshold: 1,