From 73836bb553cc1db4b6291ed57583c1bd82cc0e1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ey=C3=BE=C3=B3r=20Magn=C3=BAsson?= <system@garden.io>
Date: Tue, 22 Oct 2024 14:08:35 +0200
Subject: [PATCH] fix(k8s): re-enable showing logs + failing fast on Helm
 errors

Previously we would use Garden's own resource monitoring to monitor the
health of a Helm install/upgrade and fail fast if one of the resources
was unhealthy as well as show K8s events and Pod logs.

In commit 7a68373 we replaced that with Helm's own `--wait` flag.

The reason for that change was because a user reported issues with
Garden returning early from the Helm command in the success case
(see #6078 and #6053 for more details).

The problem with that change is that since we weren't using our own
resource monitoring we stopped showing events and logs when Helm
installs/upgrades fail. Another problem is that Garden would now wait
for the Helm command to complete which in the case of unhealthy
resources means Helm will timeout, with a default of 300 seconds.

This commit fixes that and we try to go for the best of both worlds:

- We always use the `--wait` flag but also monitor resources at the same time
- If the resources are healthy we wait for the Helm command to
  complete (this was the intent with 7a68373)
- If we detect unhealthy resources we fail fast (as we did before on the
  same major version)
- We add a flag to overwrite the fail fast behaviour in case a user
  might prefer that
---
 core/src/plugins/kubernetes/helm/config.ts    |  20 +-
 .../src/plugins/kubernetes/helm/deployment.ts | 198 ++++++++++++----
 core/src/plugins/kubernetes/helm/handlers.ts  |   2 +
 core/src/plugins/kubernetes/helm/helm-cli.ts  |   4 +-
 .../kubernetes/kubernetes-type/handlers.ts    |   1 -
 core/src/plugins/kubernetes/status/status.ts  |   3 +
 .../src/plugins/kubernetes/status/workload.ts |   8 +-
 core/src/plugins/kubernetes/util.ts           |   4 +-
 .../data/test-projects/helm/api/garden.yml    |   7 +-
 .../helm/api/templates/deployment.yaml        |   3 +-
 .../data/test-projects/helm/api/values.yaml   |   2 +
 .../with-build-action/with-build.garden.yml   |   2 +-
 .../src/plugins/kubernetes/helm/deployment.ts | 212 ++++++++++++++++++
 docs/reference/action-types/Deploy/helm.md    |  24 +-
 docs/reference/module-types/helm.md           |  14 +-
 examples/vote-helm/api.garden.yml             |   1 +
 16 files changed, 443 insertions(+), 62 deletions(-)

diff --git a/core/src/plugins/kubernetes/helm/config.ts b/core/src/plugins/kubernetes/helm/config.ts
index d066860ba1..afe03a22ac 100644
--- a/core/src/plugins/kubernetes/helm/config.ts
+++ b/core/src/plugins/kubernetes/helm/config.ts
@@ -42,6 +42,7 @@ interface HelmChartSpec {
 
 interface HelmDeployActionSpec {
   atomic: boolean
+  waitForUnhealthyResources: boolean
   chart?: HelmChartSpec
   defaultTarget?: KubernetesTargetResourceSpec
   sync?: KubernetesDeploySyncSpec
@@ -166,7 +167,13 @@ const helmChartSpecSchema = () =>
     )
 
 export const defaultHelmAtomicFlag = false
-export const defaultHelmAtomicFlagDesc = `Whether to set the --atomic flag during installs and upgrades. Set to true if you'd like the changes applied to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually roll back, instead of having Helm do it automatically on failure.`
+export const defaultHelmAtomicFlagDesc = dedent`
+  Whether to set the \`--atomic\` flag during installs and upgrades. Set to \`true\` if you'd like the changes applied
+  to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually
+  roll back, instead of having Helm do it automatically on failure.
+
+  Note that setting \`atomic\` to \`true\` implies \`wait\`.
+`
 
 export const helmDeploySchema = () =>
   joi
@@ -174,6 +181,17 @@ export const helmDeploySchema = () =>
     .keys({
       ...helmCommonSchemaKeys(),
       atomic: joi.boolean().default(defaultHelmAtomicFlag).description(defaultHelmAtomicFlagDesc),
+      waitForUnhealthyResources: joi.boolean().default(false).description(dedent`
+        Whether to wait for the Helm command to complete before throwing an error if one of the resources being installed/upgraded is unhealthy.
+
+        By default, Garden will monitor the resources being created by Helm and throw an error as soon as one of them is unhealthy. This allows Garden to fail fast if there's an issue with one of the resources. If no issue is detected, Garden waits for the Helm command to complete.
+
+        If however \`waitForUnhealthyResources\` is set to \`true\` and some resources are unhealthy, then Garden will wait for Helm itself to throw an error which typically happens when it times out in the case of unhealthy resources (e.g. due to \`ImagePullBackOff\` or \`CrashLoopBackOff\` errors).
+
+        Waiting for the timeout can take awhile so using the default value here is recommended unless you'd like to completely mimic Helm's behaviour and not rely on Garden's resource monitoring.
+
+        Note that setting \`atomic\` to \`true\` implies \`waitForUnhealthyResources\`.
+      `),
       chart: helmChartSpecSchema(),
       defaultTarget: defaultTargetSchema(),
       sync: kubernetesDeploySyncSchema(),
diff --git a/core/src/plugins/kubernetes/helm/deployment.ts b/core/src/plugins/kubernetes/helm/deployment.ts
index 16ec94fbf7..daeed26c63 100644
--- a/core/src/plugins/kubernetes/helm/deployment.ts
+++ b/core/src/plugins/kubernetes/helm/deployment.ts
@@ -24,7 +24,49 @@ import { isEmpty } from "lodash-es"
 import { getK8sIngresses } from "../status/ingress.js"
 import { toGardenError } from "../../../exceptions.js"
 import { upsertConfigMap } from "../util.js"
-import type { SyncableResource } from "../types.js"
+import type { KubernetesResource, SyncableResource } from "../types.js"
+import { isTruthy } from "../../../util/util.js"
+import { styles } from "../../../logger/styles.js"
+import type { ActionLog } from "../../../logger/log-entry.js"
+
+type WrappedInstallError = { source: "helm" | "waitForResources"; error: unknown }
+
+function isWrappedInstallError(error: unknown): error is WrappedInstallError {
+  return (
+    typeof error === "object" &&
+    error !== null &&
+    "error" in error &&
+    "source" in error &&
+    (error.source === "helm" || error.source === "waitForResources")
+  )
+}
+
+function isErrorWithMessage(error: unknown): error is { message: string } {
+  return typeof error === "object" && error !== null && "message" in error
+}
+
+async function getUnhealthyResourceLogs({
+  namespace,
+  log,
+  manifests,
+  api,
+}: {
+  namespace: string
+  log: ActionLog
+  manifests: KubernetesResource[]
+  api: KubeApi
+}): Promise<string | null> {
+  const unhealthyResources = (await checkResourceStatuses({ api, namespace, manifests, log })).filter(
+    (r) => r.state === "unhealthy"
+  )
+  const logsArr = unhealthyResources.map((r) => r.logs).filter(isTruthy)
+
+  if (logsArr.length === 0) {
+    return null
+  }
+
+  return logsArr.join("\n\n")
+}
 
 export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async (params) => {
   const { ctx, action, log, force } = params
@@ -63,43 +105,125 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
   ]
 
   if (spec.atomic) {
-    // Make sure chart gets purged if it fails to install. Note: --atomic implies --wait.
+    // This options means that the chart gets purged if it fails to install
     commonArgs.push("--atomic")
   }
 
-  if (releaseStatus.state === "missing") {
-    log.silly(() => `Installing Helm release ${releaseName}`)
-    const installArgs = ["install", releaseName, ...reference, ...commonArgs]
+  let helmArgs: string[]
+  const shouldInstall = releaseStatus.state === "missing"
+  if (shouldInstall) {
+    helmArgs = ["install", releaseName, ...reference, ...commonArgs]
     if (force && !ctx.production) {
-      installArgs.push("--replace")
+      helmArgs.push("--replace")
     }
-    await helm({ ctx: k8sCtx, namespace, log, args: [...installArgs], emitLogEvents: true })
   } else {
-    log.silly(() => `Upgrading Helm release ${releaseName}`)
-    const upgradeArgs = ["upgrade", releaseName, ...reference, "--install", ...commonArgs]
-    await helm({ ctx: k8sCtx, namespace, log, args: [...upgradeArgs], emitLogEvents: true })
-
-    // If ctx.cloudApi is defined, the user is logged in and they might be trying to deploy to an environment
-    // that could have been paused by Garden Cloud's AEC functionality. We therefore make sure to clean up any
-    // dangling annotations created by Garden Cloud.
-    if (ctx.cloudApi) {
-      try {
-        const pausedResources = await getPausedResources({ ctx: k8sCtx, action, namespace, releaseName, log })
-        await Promise.all(
-          pausedResources.map((resource) => {
-            const { annotations } = resource.metadata
-            if (annotations) {
-              delete annotations[gardenCloudAECPauseAnnotation]
-              return api.annotateResource({ log, resource, annotations })
-            }
-            return
-          })
-        )
-      } catch (error) {
-        const errorMsg = `Failed to remove Garden Cloud AEC annotations for deploy: ${action.name}.`
-        log.warn(errorMsg)
-        log.debug({ error: toGardenError(error) })
-      }
+    helmArgs = ["upgrade", releaseName, ...reference, "--install", ...commonArgs]
+  }
+
+  const preparedManifests = await prepareManifests({
+    ctx: k8sCtx,
+    log,
+    action,
+    ...preparedTemplates,
+  })
+  const manifests = await filterManifests(preparedManifests)
+
+  // We never fail fast with --atomic
+  const failFast = spec.atomic === false && spec.waitForUnhealthyResources === false
+  let wrappedInstallError: unknown | null = null
+  // This is basically an internal field that's only used for testing. Couldn't think of a better approach -E
+  let helmCommandSuccessful = false
+  const helmPromise = helm({ ctx: k8sCtx, namespace, log, args: [...helmArgs], emitLogEvents: true })
+    .then(() => {
+      helmCommandSuccessful = true
+    })
+    .catch((error) => {
+      throw { source: "helm", error }
+    })
+
+  log.debug(() => `${shouldInstall ? "Installing" : "Upgrading"} Helm release ${releaseName}`)
+  if (failFast) {
+    // In this case we use Garden's resource monitoring and fail fast if one of the resources being installed is unhealthy.
+    log.silly(() => `Will fail fast if Helm resources are unhealthy`)
+    const waitForResourcesPromise = waitForResources({
+      namespace,
+      ctx: k8sCtx,
+      provider: k8sCtx.provider,
+      actionName: action.key(),
+      resources: manifests,
+      log,
+      timeoutSec: action.getConfig("timeout"),
+    }).catch((error) => {
+      throw { source: "waitForResources", error }
+    })
+
+    // Wait for either the first error or Helm completion
+    try {
+      await Promise.race([
+        // Wait for helm to complete
+        helmPromise,
+        // If either throws, this will reject
+        Promise.all([helmPromise, waitForResourcesPromise]),
+      ])
+    } catch (err) {
+      wrappedInstallError = err
+    }
+  } else {
+    // In this case we don't monitor the resources and simply let the Helm command run until completion
+    log.silly(() => `Will not fail fast if Helm resources are unhealthy but wait for Helm to complete`)
+    try {
+      await helmPromise
+    } catch (err) {
+      wrappedInstallError = err
+    }
+  }
+
+  if (wrappedInstallError) {
+    if (!isWrappedInstallError(wrappedInstallError)) {
+      throw wrappedInstallError
+    }
+
+    const error = wrappedInstallError.error
+
+    // If it's a direct Helm error we try get the logs and events for the resources and add them to the error message
+    // unless --atomic=true because in that case the events and logs won't be available after the roll back.
+    // If it's an error from the resource monitoring it will already contain the logs and events.
+    if (wrappedInstallError.source === "helm" && !spec.atomic && isErrorWithMessage(error)) {
+      const logs = await getUnhealthyResourceLogs({
+        namespace,
+        log,
+        manifests,
+        api,
+      })
+      error.message += styles.primary(
+        `\n\nFound unhealthy resources for release ${styles.accent(releaseName)}. Below are Kubernetes events and (if applicable) Pod logs from the unhealthy resources.\n\n`
+      )
+      error.message += logs
+    }
+
+    throw error
+  }
+
+  // If ctx.cloudApi is defined, the user is logged in and they might be trying to deploy to an environment
+  // that could have been paused by Garden Cloud's AEC functionality. We therefore make sure to clean up any
+  // dangling annotations created by Garden Cloud.
+  if (ctx.cloudApi) {
+    try {
+      const pausedResources = await getPausedResources({ ctx: k8sCtx, action, namespace, releaseName, log })
+      await Promise.all(
+        pausedResources.map((resource) => {
+          const { annotations } = resource.metadata
+          if (annotations) {
+            delete annotations[gardenCloudAECPauseAnnotation]
+            return api.annotateResource({ log, resource, annotations })
+          }
+          return
+        })
+      )
+    } catch (error) {
+      const errorMsg = `Failed to remove Garden Cloud AEC annotations for deploy: ${action.name}.`
+      log.warn(errorMsg)
+      log.debug({ error: toGardenError(error) })
     }
   }
 
@@ -119,14 +243,6 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
     data: gardenMetadata,
   })
 
-  const preparedManifests = await prepareManifests({
-    ctx: k8sCtx,
-    log,
-    action,
-    ...preparedTemplates,
-  })
-  const manifests = await filterManifests(preparedManifests)
-
   const mode = action.mode()
 
   // Because we need to modify the Deployment, and because there is currently no reliable way to do that before
@@ -205,7 +321,7 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
       state: "ready",
       version: action.versionString(),
       ingresses,
-      detail: { remoteResources: statuses.map((s) => s.resource) },
+      detail: { remoteResources: statuses.map((s) => s.resource), helmCommandSuccessful },
     },
     attached,
     // TODO-0.13.1
diff --git a/core/src/plugins/kubernetes/helm/handlers.ts b/core/src/plugins/kubernetes/helm/handlers.ts
index 55aa7a6118..9d9e0eb2f9 100644
--- a/core/src/plugins/kubernetes/helm/handlers.ts
+++ b/core/src/plugins/kubernetes/helm/handlers.ts
@@ -187,6 +187,8 @@ function prepareDeployAction({
 
     spec: {
       atomic: module.spec.atomicInstall,
+      // This option is not available on Modules so we default to false when converting from modules
+      waitForUnhealthyResources: false,
       portForwards: module.spec.portForwards,
       namespace: module.spec.namespace,
       releaseName: module.spec.releaseName || module.name,
diff --git a/core/src/plugins/kubernetes/helm/helm-cli.ts b/core/src/plugins/kubernetes/helm/helm-cli.ts
index 1f0efc08d5..1ee3b4f996 100644
--- a/core/src/plugins/kubernetes/helm/helm-cli.ts
+++ b/core/src/plugins/kubernetes/helm/helm-cli.ts
@@ -117,7 +117,9 @@ export async function helm({
   }
 
   const outputStream = split2()
-  outputStream.on("error", () => {})
+  outputStream.on("error", () => {
+    // Do nothing
+  })
   outputStream.on("data", (line: Buffer) => {
     if (emitLogEvents) {
       ctx.events.emit("log", { timestamp: new Date().toISOString(), msg: line.toString(), ...logEventContext })
diff --git a/core/src/plugins/kubernetes/kubernetes-type/handlers.ts b/core/src/plugins/kubernetes/kubernetes-type/handlers.ts
index 8738d0c555..c0e7984541 100644
--- a/core/src/plugins/kubernetes/kubernetes-type/handlers.ts
+++ b/core/src/plugins/kubernetes/kubernetes-type/handlers.ts
@@ -430,7 +430,6 @@ export const kubernetesDeploy: DeployActionHandler<"deploy", KubernetesDeployAct
       waitForJobs: spec.waitForJobs,
     })
   }
-
   const status = await getKubernetesDeployStatus(<any>params)
 
   // Make sure port forwards work after redeployment
diff --git a/core/src/plugins/kubernetes/status/status.ts b/core/src/plugins/kubernetes/status/status.ts
index 272b0ccbfb..c4d00b19ff 100644
--- a/core/src/plugins/kubernetes/status/status.ts
+++ b/core/src/plugins/kubernetes/status/status.ts
@@ -268,6 +268,9 @@ interface WaitParams {
 
 /**
  * Wait until the rollout is complete for each of the given Kubernetes objects
+ *
+ * @throws {DeploymentResourceStatusError} as soon as resource with state="unhealthy" is found
+ * @throws {DeploymentError} if it times out waiting for resource
  */
 export async function waitForResources({
   namespace,
diff --git a/core/src/plugins/kubernetes/status/workload.ts b/core/src/plugins/kubernetes/status/workload.ts
index 07f9a6d2ab..a8936fca98 100644
--- a/core/src/plugins/kubernetes/status/workload.ts
+++ b/core/src/plugins/kubernetes/status/workload.ts
@@ -113,17 +113,15 @@ export async function checkWorkloadStatus({ api, namespace, resource }: StatusHa
       podLogs = null
     }
 
-    logs += styles.accent(
-      `\n\n━━━ Latest logs from failed containers in each Pod in ${workload.kind} ${workload.metadata.name} ━━━\n`
-    )
     if (podLogs) {
+      logs += styles.accent(
+        `\n\n━━━ Latest logs from failed containers in each Pod in ${workload.kind} ${workload.metadata.name} ━━━\n`
+      )
       logs += podLogs
       logs += styles.primary(dedent`
         \n💡 Garden hint: For complete Pod logs for this ${workload.kind}, run the following command:
         ${styles.command(`kubectl -n ${namespace} --context=${api.context} logs ${workload.kind.toLowerCase()}/${workload.metadata.name} --all-containers`)}
       `)
-    } else {
-      logs += "<No Pod logs found>"
     }
 
     return <ResourceStatus>{
diff --git a/core/src/plugins/kubernetes/util.ts b/core/src/plugins/kubernetes/util.ts
index 1b8918022b..e968ad5add 100644
--- a/core/src/plugins/kubernetes/util.ts
+++ b/core/src/plugins/kubernetes/util.ts
@@ -463,8 +463,8 @@ export async function upsertConfigMap({
  * becomes
  * `[{ metadata: { name: a }}, { metadata: { name: b }}, { metadata: { name: b }}]`
  */
-export function flattenResources(resources: KubernetesResource[]) {
-  return flatten(resources.map((r: any) => (r.apiVersion === "v1" && r.kind === "List" ? r.items : [r])))
+export function flattenResources(resources: KubernetesResource[]): KubernetesResource[] {
+  return flatten(resources.map((r: KubernetesResource) => (r.apiVersion === "v1" && r.kind === "List" ? r.items : [r])))
 }
 
 /**
diff --git a/core/test/data/test-projects/helm/api/garden.yml b/core/test/data/test-projects/helm/api/garden.yml
index d7d716520b..bed18d82cf 100644
--- a/core/test/data/test-projects/helm/api/garden.yml
+++ b/core/test/data/test-projects/helm/api/garden.yml
@@ -13,6 +13,7 @@ spec:
     kind: Deployment
     name: api-release
   values:
+    args: [python, app.py]
     image:
       repository: api-image
       tag: ${actions.build.api-image.version}
@@ -20,9 +21,7 @@ spec:
       enabled: true
       paths: [/]
       hosts: [api.local.demo.garden]
-
 ---
-
 kind: Module
 description: The API backend for the voting UI
 type: helm
@@ -30,8 +29,8 @@ name: api-module
 releaseName: api-module-release
 devMode:
   sync:
-  - target: /app
-    mode: two-way
+    - target: /app
+      mode: two-way
 serviceResource:
   kind: Deployment
   containerModule: api-image
diff --git a/core/test/data/test-projects/helm/api/templates/deployment.yaml b/core/test/data/test-projects/helm/api/templates/deployment.yaml
index 5d3d5616e7..e486cf6e0c 100644
--- a/core/test/data/test-projects/helm/api/templates/deployment.yaml
+++ b/core/test/data/test-projects/helm/api/templates/deployment.yaml
@@ -24,7 +24,8 @@ spec:
         - name: {{ .Chart.Name }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
-          args: [python, app.py]
+          args:
+            {{- toYaml .Values.args | nindent 12 }}
           ports:
             - name: http
               containerPort: 80
diff --git a/core/test/data/test-projects/helm/api/values.yaml b/core/test/data/test-projects/helm/api/values.yaml
index 097dc7157d..f302618dfd 100644
--- a/core/test/data/test-projects/helm/api/values.yaml
+++ b/core/test/data/test-projects/helm/api/values.yaml
@@ -29,6 +29,8 @@ ingress:
   #    hosts:
   #      - chart-example.local
 
+args: []
+
 resources: {}
   # We usually recommend not to specify default resources and to leave this as a conscious
   # choice for the user. This also increases chances charts run on environments with little
diff --git a/core/test/data/test-projects/kubernetes-type/with-build-action/with-build.garden.yml b/core/test/data/test-projects/kubernetes-type/with-build-action/with-build.garden.yml
index dd5eeee6ff..ef98aff3ce 100644
--- a/core/test/data/test-projects/kubernetes-type/with-build-action/with-build.garden.yml
+++ b/core/test/data/test-projects/kubernetes-type/with-build-action/with-build.garden.yml
@@ -3,4 +3,4 @@ type: kubernetes
 name: with-build-action
 build: exec-build
 spec:
-  files: [ "*.yaml" ]
+  files: ["*.yaml"]
diff --git a/core/test/integ/src/plugins/kubernetes/helm/deployment.ts b/core/test/integ/src/plugins/kubernetes/helm/deployment.ts
index 27d0d81c3b..8aa2b395a0 100644
--- a/core/test/integ/src/plugins/kubernetes/helm/deployment.ts
+++ b/core/test/integ/src/plugins/kubernetes/helm/deployment.ts
@@ -30,6 +30,9 @@ import { createActionLog } from "../../../../../../src/logger/log-entry.js"
 import type { NamespaceStatus } from "../../../../../../src/types/namespace.js"
 import { FakeCloudApi } from "../../../../../helpers/api.js"
 import { getActionNamespace } from "../../../../../../src/plugins/kubernetes/namespace.js"
+import stripAnsi from "strip-ansi"
+import { randomString } from "../../../../../../src/util/string.js"
+import { ChildProcessError, DeploymentError } from "../../../../../../src/exceptions.js"
 
 describe("helmDeploy in local-mode", () => {
   let garden: TestGarden
@@ -455,4 +458,213 @@ describe("helmDeploy", () => {
     })
     expect(releaseStatusAfterScaleDown.state).to.equal("outdated")
   })
+
+  it("should include K8s events and Pod logs with errors", async () => {
+    graph = await garden.getConfigGraph({ log: garden.log, emit: false })
+
+    const action = graph.getDeploy("api") as HelmDeployAction
+    action._config.spec.values["args"] = ["/bin/sh", "-c", "echo 'hello' && exit 1"]
+    action._config.spec.values["ingress"]!["enabled"] = false
+    action._config.spec.releaseName = `api-${randomString(4)}`
+    const resolvedAction = await garden.resolveAction<HelmDeployAction>({
+      action,
+      log: garden.log,
+      graph,
+    })
+    const actionLog = createActionLog({
+      log: garden.log,
+      actionName: resolvedAction.name,
+      actionKind: resolvedAction.kind,
+    })
+
+    const releaseName = getReleaseName(resolvedAction)
+    await expectError(
+      () =>
+        helmDeploy({
+          ctx,
+          log: actionLog,
+          action: resolvedAction,
+          force: false,
+        }),
+      (err) => {
+        const message = stripAnsi(err.message)
+        expect(message).to.include(`Latest events from Deployment ${releaseName}`)
+        expect(message).to.include(`BackOff`)
+        expect(message).to.include(`Latest logs from failed containers in each Pod in Deployment ${releaseName}`)
+        expect(message).to.match(/api-.+\/api: hello/)
+      }
+    )
+  })
+  it("should fail fast if one of the resources is unhealthy", async () => {
+    graph = await garden.getConfigGraph({ log: garden.log, emit: false })
+
+    const action = graph.getDeploy("api") as HelmDeployAction
+    action._config.spec.values["args"] = ["/bin/sh", "-c", "echo 'hello' && exit 1"]
+    action._config.spec.values["ingress"]!["enabled"] = false
+    action._config.spec.releaseName = `api-${randomString(4)}`
+    const resolvedAction = await garden.resolveAction<HelmDeployAction>({
+      action,
+      log: garden.log,
+      graph,
+    })
+    const actionLog = createActionLog({
+      log: garden.log,
+      actionName: resolvedAction.name,
+      actionKind: resolvedAction.kind,
+    })
+
+    await expectError(
+      () =>
+        helmDeploy({
+          ctx,
+          log: actionLog,
+          action: resolvedAction,
+          force: false,
+        }),
+      (err) => {
+        expect(err).to.be.instanceOf(DeploymentError)
+      }
+    )
+  })
+  it("should wait for the Helm command to complete if there are no resource errors", async () => {
+    graph = await garden.getConfigGraph({ log: garden.log, emit: false })
+
+    const action = graph.getDeploy("api") as HelmDeployAction
+    const resolvedAction = await garden.resolveAction<HelmDeployAction>({
+      action,
+      log: garden.log,
+      graph,
+    })
+    const actionLog = createActionLog({
+      log: garden.log,
+      actionName: resolvedAction.name,
+      actionKind: resolvedAction.kind,
+    })
+
+    const res = await helmDeploy({
+      ctx,
+      log: actionLog,
+      action: resolvedAction,
+      force: false,
+    })
+
+    expect(res.detail?.detail.helmCommandSuccessful).to.eql(true)
+  })
+  context("atomic=true", () => {
+    it("should NOT fail fast if one of the resources is unhealthy but wait for the Helm command to complete", async () => {
+      graph = await garden.getConfigGraph({ log: garden.log, emit: false })
+
+      const action = graph.getDeploy("api") as HelmDeployAction
+      action._config.timeout = 5
+      action._config.spec.atomic = true
+      action._config.spec.values = {
+        ...action._config.spec.values,
+        args: ["/bin/sh", "-c", "echo 'hello' && exit 1"],
+      }
+      const resolvedAction = await garden.resolveAction<HelmDeployAction>({
+        action,
+        log: garden.log,
+        graph,
+      })
+      const actionLog = createActionLog({
+        log: garden.log,
+        actionName: resolvedAction.name,
+        actionKind: resolvedAction.kind,
+      })
+
+      const releaseName = getReleaseName(resolvedAction)
+      await expectError(
+        () =>
+          helmDeploy({
+            ctx,
+            log: actionLog,
+            action: resolvedAction,
+            force: false,
+          }),
+        (err) => {
+          const message = stripAnsi(err.message)
+          expect(err).to.be.instanceOf(ChildProcessError)
+          expect(message).to.include(`release ${releaseName} failed`)
+          expect(message).to.include(`due to atomic being set`)
+        }
+      )
+    })
+  })
+  context("waitForUnhealthyResources=true", () => {
+    it("should include K8s events and Pod logs with errors", async () => {
+      graph = await garden.getConfigGraph({ log: garden.log, emit: false })
+
+      const action = graph.getDeploy("api") as HelmDeployAction
+      action._config.timeout = 5
+      action._config.spec.waitForUnhealthyResources = true
+      action._config.spec.values = {
+        ...action._config.spec.values,
+        args: ["/bin/sh", "-c", "echo 'hello' && exit 1"],
+      }
+      const resolvedAction = await garden.resolveAction<HelmDeployAction>({
+        action,
+        log: garden.log,
+        graph,
+      })
+      const actionLog = createActionLog({
+        log: garden.log,
+        actionName: resolvedAction.name,
+        actionKind: resolvedAction.kind,
+      })
+
+      const releaseName = getReleaseName(resolvedAction)
+      await expectError(
+        () =>
+          helmDeploy({
+            ctx,
+            log: actionLog,
+            action: resolvedAction,
+            force: false,
+          }),
+        (err) => {
+          const message = stripAnsi(err.message)
+          expect(message).to.include(`Latest events from Deployment ${releaseName}`)
+          expect(message).to.include(`BackOff`)
+          expect(message).to.include(`Latest logs from failed containers in each Pod in Deployment ${releaseName}`)
+          expect(message).to.match(/api-release-.+\/api: hello/)
+        }
+      )
+    })
+    it("should NOT fail fast if one of the resources is unhealthy but wait for the Helm command to complete", async () => {
+      graph = await garden.getConfigGraph({ log: garden.log, emit: false })
+
+      const action = graph.getDeploy("api") as HelmDeployAction
+      action._config.timeout = 5
+      action._config.spec.waitForUnhealthyResources = true
+      action._config.spec.values = {
+        ...action._config.spec.values,
+        args: ["/bin/sh", "-c", "echo 'hello' && exit 1"],
+      }
+      const resolvedAction = await garden.resolveAction<HelmDeployAction>({
+        action,
+        log: garden.log,
+        graph,
+      })
+      const actionLog = createActionLog({
+        log: garden.log,
+        actionName: resolvedAction.name,
+        actionKind: resolvedAction.kind,
+      })
+
+      await expectError(
+        () =>
+          helmDeploy({
+            ctx,
+            log: actionLog,
+            action: resolvedAction,
+            force: false,
+          }),
+        (err) => {
+          const message = stripAnsi(err.message)
+          expect(err).to.be.instanceOf(ChildProcessError)
+          expect(message).to.include(`Error: UPGRADE FAILED`)
+        }
+      )
+    })
+  })
 })
diff --git a/docs/reference/action-types/Deploy/helm.md b/docs/reference/action-types/Deploy/helm.md
index b568e04082..f0860b7b2f 100644
--- a/docs/reference/action-types/Deploy/helm.md
+++ b/docs/reference/action-types/Deploy/helm.md
@@ -389,7 +389,29 @@ this action config's directory.
 
 [spec](#spec) > atomic
 
-Whether to set the --atomic flag during installs and upgrades. Set to true if you'd like the changes applied to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually roll back, instead of having Helm do it automatically on failure.
+Whether to set the `--atomic` flag during installs and upgrades. Set to `true` if you'd like the changes applied
+to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually
+roll back, instead of having Helm do it automatically on failure.
+
+Note that setting `atomic` to `true` implies `wait`.
+
+| Type      | Default | Required |
+| --------- | ------- | -------- |
+| `boolean` | `false` | No       |
+
+### `spec.waitForUnhealthyResources`
+
+[spec](#spec) > waitForUnhealthyResources
+
+Whether to wait for the Helm command to complete before throwing an error if one of the resources being installed/upgraded is unhealthy.
+
+By default, Garden will monitor the resources being created by Helm and throw an error as soon as one of them is unhealthy. This allows Garden to fail fast if there's an issue with one of the resources. If no issue is detected, Garden waits for the Helm command to complete.
+
+If however `waitForUnhealthyResources` is set to `true` and some resources are unhealthy, then Garden will wait for Helm itself to throw an error which typically happens when it times out in the case of unhealthy resources (e.g. due to `ImagePullBackOff` or `CrashLoopBackOff` errors).
+
+Waiting for the timeout can take awhile so using the default value here is recommended unless you'd like to completely mimic Helm's behaviour and not rely on Garden's resource monitoring.
+
+Note that setting `atomic` to `true` implies `waitForUnhealthyResources`.
 
 | Type      | Default | Required |
 | --------- | ------- | -------- |
diff --git a/docs/reference/module-types/helm.md b/docs/reference/module-types/helm.md
index 5fe7d8fdac..aae5cfc08c 100644
--- a/docs/reference/module-types/helm.md
+++ b/docs/reference/module-types/helm.md
@@ -219,9 +219,11 @@ values: {}
 # this action config's directory.
 valueFiles: []
 
-# Whether to set the --atomic flag during installs and upgrades. Set to true if you'd like the changes applied to be
-# reverted on failure. Set to false if e.g. you want to see more information about failures and then manually roll
-# back, instead of having Helm do it automatically on failure.
+# Whether to set the `--atomic` flag during installs and upgrades. Set to `true` if you'd like the changes applied
+# to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually
+# roll back, instead of having Helm do it automatically on failure.
+#
+# Note that setting `atomic` to `true` implies `wait`.
 atomicInstall: false
 
 # The name of another `helm` module to use as a base for this one. Use this to re-use a Helm chart across multiple
@@ -1044,7 +1046,11 @@ this action config's directory.
 
 ### `atomicInstall`
 
-Whether to set the --atomic flag during installs and upgrades. Set to true if you'd like the changes applied to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually roll back, instead of having Helm do it automatically on failure.
+Whether to set the `--atomic` flag during installs and upgrades. Set to `true` if you'd like the changes applied
+to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually
+roll back, instead of having Helm do it automatically on failure.
+
+Note that setting `atomic` to `true` implies `wait`.
 
 | Type      | Default | Required |
 | --------- | ------- | -------- |
diff --git a/examples/vote-helm/api.garden.yml b/examples/vote-helm/api.garden.yml
index 0bf9c4ff29..fe6d747b02 100644
--- a/examples/vote-helm/api.garden.yml
+++ b/examples/vote-helm/api.garden.yml
@@ -3,6 +3,7 @@ description: The API backend for the voting UI
 type: helm
 name: api
 dependencies:
+  - build.api-image
   - deploy.redis
 variables:
   repository: ${actions.build.api-image.outputs.deployment-image-name}