fix(k8s): re-enable showing logs + failing fast on Helm errors
Previously we used Garden's own resource monitoring to track the
health of a Helm install/upgrade, failing fast if any of the resources
was unhealthy and showing K8s events and Pod logs.

In commit 7a68373 we replaced that with Helm's own `--wait` flag.

We made that change because a user reported issues with
Garden returning early from the Helm command in the success case
(see #6078 and #6053 for more details).

The problem with that change is that, since we no longer used our own
resource monitoring, we stopped showing events and logs when Helm
installs/upgrades fail. Another problem is that Garden now waits
for the Helm command to complete, which in the case of unhealthy
resources means waiting for Helm to time out (300 seconds by default).

This commit fixes that by going for the best of both worlds:

- We always use the `--wait` flag but also monitor resources at the same time
- If the resources are healthy, we wait for the Helm command to
  complete (this was the intent of 7a68373)
- If we detect unhealthy resources, we fail fast (as we did before on the
  same major version)
- We add a flag to override the fail-fast behaviour in case a user
  prefers to wait for Helm (see the sketch below)
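
A minimal sketch of that flow, under stated assumptions: `runHelm()` and `monitorResources()` are stand-ins for Garden's actual helpers (`helm(...)` and `waitForResources(...)` in the diff below), and error handling is reduced to tagging failures with their source:

```ts
type Tagged = { source: "helm" | "waitForResources"; error: unknown }

async function deployWithFailFast(
  runHelm: () => Promise<void>, // runs `helm upgrade --install --wait ...`
  monitorResources: () => Promise<void>, // rejects on the first unhealthy resource
  failFast: boolean
): Promise<void> {
  // Tag failures with their origin so the caller can tell them apart
  const helmPromise = runHelm().catch((error) => {
    throw { source: "helm", error } satisfies Tagged
  })

  if (!failFast) {
    // atomic=true or waitForUnhealthyResources=true: let Helm run to
    // completion (or to its own timeout) without resource monitoring
    return helmPromise
  }

  const monitorPromise = monitorResources().catch((error) => {
    throw { source: "waitForResources", error } satisfies Tagged
  })

  // Resolves only once Helm completes, but rejects as soon as either the
  // Helm command or the resource monitor fails
  await Promise.race([helmPromise, Promise.all([helmPromise, monitorPromise])])
}
```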
Eyþór Magnússon committed Nov 12, 2024
1 parent: 227e44b · commit: 73836bb
Showing 16 changed files with 443 additions and 62 deletions.
core/src/plugins/kubernetes/helm/config.ts (19 additions, 1 deletion)

@@ -42,6 +42,7 @@ interface HelmChartSpec {

interface HelmDeployActionSpec {
atomic: boolean
waitForUnhealthyResources: boolean
chart?: HelmChartSpec
defaultTarget?: KubernetesTargetResourceSpec
sync?: KubernetesDeploySyncSpec
@@ -166,14 +167,31 @@ const helmChartSpecSchema = () =>
)

export const defaultHelmAtomicFlag = false
export const defaultHelmAtomicFlagDesc = `Whether to set the --atomic flag during installs and upgrades. Set to true if you'd like the changes applied to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually roll back, instead of having Helm do it automatically on failure.`
export const defaultHelmAtomicFlagDesc = dedent`
Whether to set the \`--atomic\` flag during installs and upgrades. Set to \`true\` if you'd like the changes applied
to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually
roll back, instead of having Helm do it automatically on failure.
Note that setting \`atomic\` to \`true\` implies \`wait\`.
`

export const helmDeploySchema = () =>
joi
.object()
.keys({
...helmCommonSchemaKeys(),
atomic: joi.boolean().default(defaultHelmAtomicFlag).description(defaultHelmAtomicFlagDesc),
waitForUnhealthyResources: joi.boolean().default(false).description(dedent`
Whether to wait for the Helm command to complete before throwing an error if one of the resources being installed/upgraded is unhealthy.
By default, Garden will monitor the resources being created by Helm and throw an error as soon as one of them is unhealthy. This allows Garden to fail fast if there's an issue with one of the resources. If no issue is detected, Garden waits for the Helm command to complete.
If however \`waitForUnhealthyResources\` is set to \`true\` and some resources are unhealthy, then Garden will wait for Helm itself to throw an error which typically happens when it times out in the case of unhealthy resources (e.g. due to \`ImagePullBackOff\` or \`CrashLoopBackOff\` errors).
Waiting for the timeout can take a while, so using the default value here is recommended unless you'd like to completely mimic Helm's behaviour and not rely on Garden's resource monitoring.
Note that setting \`atomic\` to \`true\` implies \`waitForUnhealthyResources\`.
`),
chart: helmChartSpecSchema(),
defaultTarget: defaultTargetSchema(),
sync: kubernetesDeploySyncSchema(),
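For reference, a hedged usage sketch of the new field on a Helm Deploy action, assuming the standard Garden action layout; the action and chart names are illustrative, not taken from this commit:

```yaml
kind: Deploy
type: helm
name: my-release # hypothetical action name
spec:
  atomic: false # do not let Helm roll back automatically on failure
  # Opt out of Garden's fail-fast behaviour and wait for Helm itself to
  # time out on unhealthy resources instead:
  waitForUnhealthyResources: true
  chart:
    name: my-chart # hypothetical chart
```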
core/src/plugins/kubernetes/helm/deployment.ts (157 additions, 41 deletions)

@@ -24,7 +24,49 @@ import { isEmpty } from "lodash-es"
import { getK8sIngresses } from "../status/ingress.js"
import { toGardenError } from "../../../exceptions.js"
import { upsertConfigMap } from "../util.js"
import type { SyncableResource } from "../types.js"
import type { KubernetesResource, SyncableResource } from "../types.js"
import { isTruthy } from "../../../util/util.js"
import { styles } from "../../../logger/styles.js"
import type { ActionLog } from "../../../logger/log-entry.js"

type WrappedInstallError = { source: "helm" | "waitForResources"; error: unknown }

function isWrappedInstallError(error: unknown): error is WrappedInstallError {
return (
typeof error === "object" &&
error !== null &&
"error" in error &&
"source" in error &&
(error.source === "helm" || error.source === "waitForResources")
)
}

function isErrorWithMessage(error: unknown): error is { message: string } {
return typeof error === "object" && error !== null && "message" in error
}

async function getUnhealthyResourceLogs({
namespace,
log,
manifests,
api,
}: {
namespace: string
log: ActionLog
manifests: KubernetesResource[]
api: KubeApi
}): Promise<string | null> {
const unhealthyResources = (await checkResourceStatuses({ api, namespace, manifests, log })).filter(
(r) => r.state === "unhealthy"
)
const logsArr = unhealthyResources.map((r) => r.logs).filter(isTruthy)

if (logsArr.length === 0) {
return null
}

return logsArr.join("\n\n")
}

export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async (params) => {
const { ctx, action, log, force } = params
@@ -63,43 +105,125 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
]

if (spec.atomic) {
// Make sure chart gets purged if it fails to install. Note: --atomic implies --wait.
// This options means that the chart gets purged if it fails to install
commonArgs.push("--atomic")
}

if (releaseStatus.state === "missing") {
log.silly(() => `Installing Helm release ${releaseName}`)
const installArgs = ["install", releaseName, ...reference, ...commonArgs]
let helmArgs: string[]
const shouldInstall = releaseStatus.state === "missing"
if (shouldInstall) {
helmArgs = ["install", releaseName, ...reference, ...commonArgs]
if (force && !ctx.production) {
installArgs.push("--replace")
helmArgs.push("--replace")
}
await helm({ ctx: k8sCtx, namespace, log, args: [...installArgs], emitLogEvents: true })
} else {
log.silly(() => `Upgrading Helm release ${releaseName}`)
const upgradeArgs = ["upgrade", releaseName, ...reference, "--install", ...commonArgs]
await helm({ ctx: k8sCtx, namespace, log, args: [...upgradeArgs], emitLogEvents: true })

// If ctx.cloudApi is defined, the user is logged in and they might be trying to deploy to an environment
// that could have been paused by Garden Cloud's AEC functionality. We therefore make sure to clean up any
// dangling annotations created by Garden Cloud.
if (ctx.cloudApi) {
try {
const pausedResources = await getPausedResources({ ctx: k8sCtx, action, namespace, releaseName, log })
await Promise.all(
pausedResources.map((resource) => {
const { annotations } = resource.metadata
if (annotations) {
delete annotations[gardenCloudAECPauseAnnotation]
return api.annotateResource({ log, resource, annotations })
}
return
})
)
} catch (error) {
const errorMsg = `Failed to remove Garden Cloud AEC annotations for deploy: ${action.name}.`
log.warn(errorMsg)
log.debug({ error: toGardenError(error) })
}
helmArgs = ["upgrade", releaseName, ...reference, "--install", ...commonArgs]
}

const preparedManifests = await prepareManifests({
ctx: k8sCtx,
log,
action,
...preparedTemplates,
})
const manifests = await filterManifests(preparedManifests)

// We never fail fast with --atomic
const failFast = spec.atomic === false && spec.waitForUnhealthyResources === false
let wrappedInstallError: unknown | null = null
// This is basically an internal field that's only used for testing. Couldn't think of a better approach -E
let helmCommandSuccessful = false
const helmPromise = helm({ ctx: k8sCtx, namespace, log, args: [...helmArgs], emitLogEvents: true })
.then(() => {
helmCommandSuccessful = true
})
.catch((error) => {
throw { source: "helm", error }
})

log.debug(() => `${shouldInstall ? "Installing" : "Upgrading"} Helm release ${releaseName}`)
if (failFast) {
// In this case we use Garden's resource monitoring and fail fast if one of the resources being installed is unhealthy.
log.silly(() => `Will fail fast if Helm resources are unhealthy`)
const waitForResourcesPromise = waitForResources({
namespace,
ctx: k8sCtx,
provider: k8sCtx.provider,
actionName: action.key(),
resources: manifests,
log,
timeoutSec: action.getConfig("timeout"),
}).catch((error) => {
throw { source: "waitForResources", error }
})

// Wait for either the first error or Helm completion
try {
await Promise.race([
// Wait for helm to complete
helmPromise,
// If either throws, this will reject
Promise.all([helmPromise, waitForResourcesPromise]),
])
} catch (err) {
wrappedInstallError = err
}
} else {
// In this case we don't monitor the resources and simply let the Helm command run until completion
log.silly(() => `Will not fail fast if Helm resources are unhealthy but wait for Helm to complete`)
try {
await helmPromise
} catch (err) {
wrappedInstallError = err
}
}

if (wrappedInstallError) {
if (!isWrappedInstallError(wrappedInstallError)) {
throw wrappedInstallError
}

const error = wrappedInstallError.error

// If it's a direct Helm error we try to get the logs and events for the resources and add them to the error message,
// unless --atomic=true because in that case the events and logs won't be available after the rollback.
// If it's an error from the resource monitoring it will already contain the logs and events.
if (wrappedInstallError.source === "helm" && !spec.atomic && isErrorWithMessage(error)) {
const logs = await getUnhealthyResourceLogs({
namespace,
log,
manifests,
api,
})
error.message += styles.primary(
`\n\nFound unhealthy resources for release ${styles.accent(releaseName)}. Below are Kubernetes events and (if applicable) Pod logs from the unhealthy resources.\n\n`
)
error.message += logs
}

throw error
}

// If ctx.cloudApi is defined, the user is logged in and they might be trying to deploy to an environment
// that could have been paused by Garden Cloud's AEC functionality. We therefore make sure to clean up any
// dangling annotations created by Garden Cloud.
if (ctx.cloudApi) {
try {
const pausedResources = await getPausedResources({ ctx: k8sCtx, action, namespace, releaseName, log })
await Promise.all(
pausedResources.map((resource) => {
const { annotations } = resource.metadata
if (annotations) {
delete annotations[gardenCloudAECPauseAnnotation]
return api.annotateResource({ log, resource, annotations })
}
return
})
)
} catch (error) {
const errorMsg = `Failed to remove Garden Cloud AEC annotations for deploy: ${action.name}.`
log.warn(errorMsg)
log.debug({ error: toGardenError(error) })
}
}

@@ -119,14 +243,6 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
data: gardenMetadata,
})

const preparedManifests = await prepareManifests({
ctx: k8sCtx,
log,
action,
...preparedTemplates,
})
const manifests = await filterManifests(preparedManifests)

const mode = action.mode()

// Because we need to modify the Deployment, and because there is currently no reliable way to do that before
Expand Down Expand Up @@ -205,7 +321,7 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
state: "ready",
version: action.versionString(),
ingresses,
detail: { remoteResources: statuses.map((s) => s.resource) },
detail: { remoteResources: statuses.map((s) => s.resource), helmCommandSuccessful },
},
attached,
// TODO-0.13.1
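A note on the `Promise.race` construction in the hunks above: racing `helmPromise` directly against `waitForResourcesPromise` would settle as soon as the monitor saw all resources become healthy, even while Helm is still running (the early-return problem from #6078/#6053). Nesting `Promise.all([helmPromise, waitForResourcesPromise])` inside the race gives the intended semantics: resolve only when Helm completes, reject as soon as either side fails. A self-contained illustration, with timers standing in for the two operations:

```ts
const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms))

async function demo() {
  const helmPromise = sleep(300).then(() => "helm done") // Helm exits after 300ms
  const monitorPromise = sleep(100).then(() => "all healthy") // monitor resolves first

  // A naive Promise.race([helmPromise, monitorPromise]) would settle after
  // ~100ms with "all healthy", before Helm is done. The nested form settles
  // after ~300ms with "helm done", yet still rejects immediately if either
  // promise rejects:
  console.log(await Promise.race([helmPromise, Promise.all([helmPromise, monitorPromise])]))
}

demo().catch(console.error)
```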
core/src/plugins/kubernetes/helm/handlers.ts (2 additions, 0 deletions)

@@ -187,6 +187,8 @@ function prepareDeployAction({

spec: {
atomic: module.spec.atomicInstall,
// This option is not available on Modules so we default to false when converting from modules
waitForUnhealthyResources: false,
portForwards: module.spec.portForwards,
namespace: module.spec.namespace,
releaseName: module.spec.releaseName || module.name,
core/src/plugins/kubernetes/helm/helm-cli.ts (3 additions, 1 deletion)

@@ -117,7 +117,9 @@ export async function helm({
}

const outputStream = split2()
outputStream.on("error", () => {})
outputStream.on("error", () => {
// Do nothing
})
outputStream.on("data", (line: Buffer) => {
if (emitLogEvents) {
ctx.events.emit("log", { timestamp: new Date().toISOString(), msg: line.toString(), ...logEventContext })
core/src/plugins/kubernetes/kubernetes-type/handlers.ts (0 additions, 1 deletion)

@@ -430,7 +430,6 @@ export const kubernetesDeploy: DeployActionHandler<"deploy", KubernetesDeployAct
waitForJobs: spec.waitForJobs,
})
}

const status = await getKubernetesDeployStatus(<any>params)

// Make sure port forwards work after redeployment
core/src/plugins/kubernetes/status/status.ts (3 additions, 0 deletions)

@@ -268,6 +268,9 @@ interface WaitParams {

/**
* Wait until the rollout is complete for each of the given Kubernetes objects
*
 * @throws {DeploymentResourceStatusError} as soon as a resource with state="unhealthy" is found
 * @throws {DeploymentError} if it times out waiting for a resource
*/
export async function waitForResources({
namespace,
core/src/plugins/kubernetes/status/workload.ts (3 additions, 5 deletions)

@@ -113,17 +113,15 @@ export async function checkWorkloadStatus({ api, namespace, resource }: StatusHa
podLogs = null
}

logs += styles.accent(
`\n\n━━━ Latest logs from failed containers in each Pod in ${workload.kind} ${workload.metadata.name} ━━━\n`
)
if (podLogs) {
logs += styles.accent(
`\n\n━━━ Latest logs from failed containers in each Pod in ${workload.kind} ${workload.metadata.name} ━━━\n`
)
logs += podLogs
logs += styles.primary(dedent`
\n💡 Garden hint: For complete Pod logs for this ${workload.kind}, run the following command:
${styles.command(`kubectl -n ${namespace} --context=${api.context} logs ${workload.kind.toLowerCase()}/${workload.metadata.name} --all-containers`)}
`)
} else {
logs += "<No Pod logs found>"
}

return <ResourceStatus>{
core/src/plugins/kubernetes/util.ts (2 additions, 2 deletions)

@@ -463,8 +463,8 @@ export async function upsertConfigMap({
* becomes
* `[{ metadata: { name: a }}, { metadata: { name: b }}, { metadata: { name: b }}]`
*/
export function flattenResources(resources: KubernetesResource[]) {
return flatten(resources.map((r: any) => (r.apiVersion === "v1" && r.kind === "List" ? r.items : [r])))
export function flattenResources(resources: KubernetesResource[]): KubernetesResource[] {
return flatten(resources.map((r: KubernetesResource) => (r.apiVersion === "v1" && r.kind === "List" ? r.items : [r])))
}

/**
core/test/data/test-projects/helm/api/garden.yml (3 additions, 4 deletions)

@@ -13,25 +13,24 @@ spec:
kind: Deployment
name: api-release
values:
args: [python, app.py]
image:
repository: api-image
tag: ${actions.build.api-image.version}
ingress:
enabled: true
paths: [/]
hosts: [api.local.demo.garden]

---

kind: Module
description: The API backend for the voting UI
type: helm
name: api-module
releaseName: api-module-release
devMode:
sync:
- target: /app
mode: two-way
- target: /app
mode: two-way
serviceResource:
kind: Deployment
containerModule: api-image
@@ -24,7 +24,8 @@ spec:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
args: [python, app.py]
args:
{{- toYaml .Values.args | nindent 12 }}
ports:
- name: http
containerPort: 80
core/test/data/test-projects/helm/api/values.yaml (2 additions, 0 deletions)

@@ -29,6 +29,8 @@ ingress:
# hosts:
# - chart-example.local

args: []

resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
@@ -3,4 +3,4 @@ type: kubernetes
name: with-build-action
build: exec-build
spec:
files: [ "*.yaml" ]
files: ["*.yaml"]