Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(k8s): show Helm events and logs #6626

Merged
merged 1 commit into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion core/src/plugins/kubernetes/helm/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ interface HelmChartSpec {

interface HelmDeployActionSpec {
atomic: boolean
waitForUnhealthyResources: boolean
chart?: HelmChartSpec
defaultTarget?: KubernetesTargetResourceSpec
sync?: KubernetesDeploySyncSpec
Expand Down Expand Up @@ -166,14 +167,31 @@ const helmChartSpecSchema = () =>
)

export const defaultHelmAtomicFlag = false
export const defaultHelmAtomicFlagDesc = `Whether to set the --atomic flag during installs and upgrades. Set to true if you'd like the changes applied to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually roll back, instead of having Helm do it automatically on failure.`
export const defaultHelmAtomicFlagDesc = dedent`
Whether to set the \`--atomic\` flag during installs and upgrades. Set to \`true\` if you'd like the changes applied
to be reverted on failure. Set to false if e.g. you want to see more information about failures and then manually
roll back, instead of having Helm do it automatically on failure.

Note that setting \`atomic\` to \`true\` implies \`wait\`.
`

export const helmDeploySchema = () =>
joi
.object()
.keys({
...helmCommonSchemaKeys(),
atomic: joi.boolean().default(defaultHelmAtomicFlag).description(defaultHelmAtomicFlagDesc),
waitForUnhealthyResources: joi.boolean().default(false).description(dedent`
Whether to wait for the Helm command to complete before throwing an error if one of the resources being installed/upgraded is unhealthy.

By default, Garden will monitor the resources being created by Helm and throw an error as soon as one of them is unhealthy. This allows Garden to fail fast if there's an issue with one of the resources. If no issue is detected, Garden waits for the Helm command to complete.

If however \`waitForUnhealthyResources\` is set to \`true\` and some resources are unhealthy, then Garden will wait for Helm itself to throw an error which typically happens when it times out in the case of unhealthy resources (e.g. due to \`ImagePullBackOff\` or \`CrashLoopBackOff\` errors).

Waiting for the timeout can take awhile so using the default value here is recommended unless you'd like to completely mimic Helm's behaviour and not rely on Garden's resource monitoring.

Note that setting \`atomic\` to \`true\` implies \`waitForUnhealthyResources\`.
`),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great explanation!

chart: helmChartSpecSchema(),
defaultTarget: defaultTargetSchema(),
sync: kubernetesDeploySyncSchema(),
Expand Down
198 changes: 157 additions & 41 deletions core/src/plugins/kubernetes/helm/deployment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,49 @@ import { isEmpty } from "lodash-es"
import { getK8sIngresses } from "../status/ingress.js"
import { toGardenError } from "../../../exceptions.js"
import { upsertConfigMap } from "../util.js"
import type { SyncableResource } from "../types.js"
import type { KubernetesResource, SyncableResource } from "../types.js"
import { isTruthy } from "../../../util/util.js"
import { styles } from "../../../logger/styles.js"
import type { ActionLog } from "../../../logger/log-entry.js"

type WrappedInstallError = { source: "helm" | "waitForResources"; error: unknown }

/**
 * Type guard for the internal wrapper used to tag install/upgrade errors with their
 * origin: the Helm CLI invocation itself, or Garden's own resource monitoring.
 */
function isWrappedInstallError(error: unknown): error is WrappedInstallError {
  if (typeof error !== "object" || error === null) {
    return false
  }
  if (!("error" in error) || !("source" in error)) {
    return false
  }
  return error.source === "helm" || error.source === "waitForResources"
}

/**
 * Type guard for errors that carry a usable `message` field.
 *
 * Unlike a bare `"message" in error` check, this also verifies that `message` is
 * actually a string, so the `{ message: string }` predicate cannot be satisfied by
 * e.g. `{ message: 42 }` — callers append to `error.message`, which would otherwise
 * silently coerce a non-string value.
 */
function isErrorWithMessage(error: unknown): error is { message: string } {
  return typeof error === "object" && error !== null && "message" in error && typeof error.message === "string"
}

/**
 * Collects the log output attached to any unhealthy resources among the given manifests.
 *
 * Checks the live status of each manifest and gathers the `logs` field from every
 * resource reported as unhealthy. Returns the gathered logs joined by blank lines,
 * or `null` when no unhealthy resource has logs attached.
 */
async function getUnhealthyResourceLogs({
  namespace,
  log,
  manifests,
  api,
}: {
  namespace: string
  log: ActionLog
  manifests: KubernetesResource[]
  api: KubeApi
}): Promise<string | null> {
  const statuses = await checkResourceStatuses({ api, namespace, manifests, log })

  const collected: string[] = []
  for (const status of statuses) {
    const resourceLogs = status.logs
    if (status.state === "unhealthy" && isTruthy(resourceLogs)) {
      collected.push(resourceLogs)
    }
  }

  return collected.length > 0 ? collected.join("\n\n") : null
}

export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async (params) => {
const { ctx, action, log, force } = params
Expand Down Expand Up @@ -63,43 +105,125 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
]

if (spec.atomic) {
// Make sure chart gets purged if it fails to install. Note: --atomic implies --wait.
// This options means that the chart gets purged if it fails to install
commonArgs.push("--atomic")
}

if (releaseStatus.state === "missing") {
log.silly(() => `Installing Helm release ${releaseName}`)
const installArgs = ["install", releaseName, ...reference, ...commonArgs]
let helmArgs: string[]
const shouldInstall = releaseStatus.state === "missing"
if (shouldInstall) {
helmArgs = ["install", releaseName, ...reference, ...commonArgs]
if (force && !ctx.production) {
installArgs.push("--replace")
helmArgs.push("--replace")
}
await helm({ ctx: k8sCtx, namespace, log, args: [...installArgs], emitLogEvents: true })
} else {
log.silly(() => `Upgrading Helm release ${releaseName}`)
const upgradeArgs = ["upgrade", releaseName, ...reference, "--install", ...commonArgs]
await helm({ ctx: k8sCtx, namespace, log, args: [...upgradeArgs], emitLogEvents: true })

// If ctx.cloudApi is defined, the user is logged in and they might be trying to deploy to an environment
// that could have been paused by Garden Cloud's AEC functionality. We therefore make sure to clean up any
// dangling annotations created by Garden Cloud.
if (ctx.cloudApi) {
try {
const pausedResources = await getPausedResources({ ctx: k8sCtx, action, namespace, releaseName, log })
await Promise.all(
pausedResources.map((resource) => {
const { annotations } = resource.metadata
if (annotations) {
delete annotations[gardenCloudAECPauseAnnotation]
return api.annotateResource({ log, resource, annotations })
}
return
})
)
} catch (error) {
const errorMsg = `Failed to remove Garden Cloud AEC annotations for deploy: ${action.name}.`
log.warn(errorMsg)
log.debug({ error: toGardenError(error) })
}
helmArgs = ["upgrade", releaseName, ...reference, "--install", ...commonArgs]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on always using upgrade --install

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah we also did this before, so no change here.

}

const preparedManifests = await prepareManifests({
ctx: k8sCtx,
log,
action,
...preparedTemplates,
})
const manifests = await filterManifests(preparedManifests)

// We never fail fast with --atomic
const failFast = spec.atomic === false && spec.waitForUnhealthyResources === false
let wrappedInstallError: unknown | null = null
// This is basically an internal field that's only used for testing. Couldn't think of a better approach -E
let helmCommandSuccessful = false
const helmPromise = helm({ ctx: k8sCtx, namespace, log, args: [...helmArgs], emitLogEvents: true })
.then(() => {
helmCommandSuccessful = true
})
.catch((error) => {
throw { source: "helm", error }
})

log.debug(() => `${shouldInstall ? "Installing" : "Upgrading"} Helm release ${releaseName}`)
if (failFast) {
// In this case we use Garden's resource monitoring and fail fast if one of the resources being installed is unhealthy.
log.silly(() => `Will fail fast if Helm resources are unhealthy`)
const waitForResourcesPromise = waitForResources({
namespace,
ctx: k8sCtx,
provider: k8sCtx.provider,
actionName: action.key(),
resources: manifests,
log,
timeoutSec: action.getConfig("timeout"),
}).catch((error) => {
throw { source: "waitForResources", error }
})

// Wait for either the first error or Helm completion
try {
await Promise.race([
// Wait for helm to complete
helmPromise,
// If either throws, this will reject
Promise.all([helmPromise, waitForResourcesPromise]),
])
} catch (err) {
wrappedInstallError = err
}
} else {
// In this case we don't monitor the resources and simply let the Helm command run until completion
log.silly(() => `Will not fail fast if Helm resources are unhealthy but wait for Helm to complete`)
try {
await helmPromise
} catch (err) {
wrappedInstallError = err
}
}

if (wrappedInstallError) {
if (!isWrappedInstallError(wrappedInstallError)) {
throw wrappedInstallError
}

const error = wrappedInstallError.error

// If it's a direct Helm error we try get the logs and events for the resources and add them to the error message
// unless --atomic=true because in that case the events and logs won't be available after the roll back.
// If it's an error from the resource monitoring it will already contain the logs and events.
if (wrappedInstallError.source === "helm" && !spec.atomic && isErrorWithMessage(error)) {
const logs = await getUnhealthyResourceLogs({
namespace,
log,
manifests,
api,
})
error.message += styles.primary(
`\n\nFound unhealthy resources for release ${styles.accent(releaseName)}. Below are Kubernetes events and (if applicable) Pod logs from the unhealthy resources.\n\n`
)
error.message += logs
}

throw error
}

// If ctx.cloudApi is defined, the user is logged in and they might be trying to deploy to an environment
// that could have been paused by Garden Cloud's AEC functionality. We therefore make sure to clean up any
// dangling annotations created by Garden Cloud.
if (ctx.cloudApi) {
try {
const pausedResources = await getPausedResources({ ctx: k8sCtx, action, namespace, releaseName, log })
await Promise.all(
pausedResources.map((resource) => {
const { annotations } = resource.metadata
if (annotations) {
delete annotations[gardenCloudAECPauseAnnotation]
return api.annotateResource({ log, resource, annotations })
}
return
})
)
} catch (error) {
const errorMsg = `Failed to remove Garden Cloud AEC annotations for deploy: ${action.name}.`
log.warn(errorMsg)
log.debug({ error: toGardenError(error) })
}
}

Expand All @@ -119,14 +243,6 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
data: gardenMetadata,
})

const preparedManifests = await prepareManifests({
ctx: k8sCtx,
log,
action,
...preparedTemplates,
})
const manifests = await filterManifests(preparedManifests)

const mode = action.mode()

// Because we need to modify the Deployment, and because there is currently no reliable way to do that before
Expand Down Expand Up @@ -205,7 +321,7 @@ export const helmDeploy: DeployActionHandler<"deploy", HelmDeployAction> = async
state: "ready",
version: action.versionString(),
ingresses,
detail: { remoteResources: statuses.map((s) => s.resource) },
detail: { remoteResources: statuses.map((s) => s.resource), helmCommandSuccessful },
},
attached,
// TODO-0.13.1
Expand Down
2 changes: 2 additions & 0 deletions core/src/plugins/kubernetes/helm/handlers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ function prepareDeployAction({

spec: {
atomic: module.spec.atomicInstall,
// This option is not available on Modules so we default to false when converting from modules
waitForUnhealthyResources: false,
portForwards: module.spec.portForwards,
namespace: module.spec.namespace,
releaseName: module.spec.releaseName || module.name,
Expand Down
4 changes: 3 additions & 1 deletion core/src/plugins/kubernetes/helm/helm-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ export async function helm({
}

const outputStream = split2()
outputStream.on("error", () => {})
outputStream.on("error", () => {
// Do nothing
})
outputStream.on("data", (line: Buffer) => {
if (emitLogEvents) {
ctx.events.emit("log", { timestamp: new Date().toISOString(), msg: line.toString(), ...logEventContext })
Expand Down
1 change: 0 additions & 1 deletion core/src/plugins/kubernetes/kubernetes-type/handlers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,6 @@ export const kubernetesDeploy: DeployActionHandler<"deploy", KubernetesDeployAct
waitForJobs: spec.waitForJobs,
})
}

const status = await getKubernetesDeployStatus(<any>params)

// Make sure port forwards work after redeployment
Expand Down
3 changes: 3 additions & 0 deletions core/src/plugins/kubernetes/status/status.ts
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,9 @@ interface WaitParams {

/**
* Wait until the rollout is complete for each of the given Kubernetes objects
*
* @throws {DeploymentResourceStatusError} as soon as resource with state="unhealthy" is found
* @throws {DeploymentError} if it times out waiting for resource
*/
export async function waitForResources({
namespace,
Expand Down
8 changes: 3 additions & 5 deletions core/src/plugins/kubernetes/status/workload.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,17 +113,15 @@ export async function checkWorkloadStatus({ api, namespace, resource }: StatusHa
podLogs = null
}

logs += styles.accent(
`\n\n━━━ Latest logs from failed containers in each Pod in ${workload.kind} ${workload.metadata.name} ━━━\n`
)
if (podLogs) {
logs += styles.accent(
`\n\n━━━ Latest logs from failed containers in each Pod in ${workload.kind} ${workload.metadata.name} ━━━\n`
)
logs += podLogs
logs += styles.primary(dedent`
\n💡 Garden hint: For complete Pod logs for this ${workload.kind}, run the following command:
${styles.command(`kubectl -n ${namespace} --context=${api.context} logs ${workload.kind.toLowerCase()}/${workload.metadata.name} --all-containers`)}
`)
} else {
logs += "<No Pod logs found>"
}

return <ResourceStatus>{
Expand Down
4 changes: 2 additions & 2 deletions core/src/plugins/kubernetes/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -463,8 +463,8 @@ export async function upsertConfigMap({
* becomes
* `[{ metadata: { name: a }}, { metadata: { name: b }}, { metadata: { name: b }}]`
*/
export function flattenResources(resources: KubernetesResource[]) {
return flatten(resources.map((r: any) => (r.apiVersion === "v1" && r.kind === "List" ? r.items : [r])))
export function flattenResources(resources: KubernetesResource[]): KubernetesResource[] {
return flatten(resources.map((r: KubernetesResource) => (r.apiVersion === "v1" && r.kind === "List" ? r.items : [r])))
}

/**
Expand Down
7 changes: 3 additions & 4 deletions core/test/data/test-projects/helm/api/garden.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,24 @@ spec:
kind: Deployment
name: api-release
values:
args: [python, app.py]
image:
repository: api-image
tag: ${actions.build.api-image.version}
ingress:
enabled: true
paths: [/]
hosts: [api.local.demo.garden]

---

kind: Module
description: The API backend for the voting UI
type: helm
name: api-module
releaseName: api-module-release
devMode:
sync:
- target: /app
mode: two-way
- target: /app
mode: two-way
serviceResource:
kind: Deployment
containerModule: api-image
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ spec:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
args: [python, app.py]
args:
{{- toYaml .Values.args | nindent 12 }}
ports:
- name: http
containerPort: 80
Expand Down
2 changes: 2 additions & 0 deletions core/test/data/test-projects/helm/api/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ ingress:
# hosts:
# - chart-example.local

args: []

resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ type: kubernetes
name: with-build-action
build: exec-build
spec:
files: [ "*.yaml" ]
files: ["*.yaml"]
Loading