Skip to content

Commit

Permalink
fix(core): ensure pod runner throws when container is OOMKilled
Browse files Browse the repository at this point in the history
  • Loading branch information
eysi09 authored and edvald committed May 10, 2021
1 parent 4b61951 commit 9dd044a
Show file tree
Hide file tree
Showing 3 changed files with 220 additions and 31 deletions.
4 changes: 4 additions & 0 deletions core/src/exceptions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ export class TimeoutError extends GardenBaseError {
type = "timeout"
}

export class OutOfMemoryError extends GardenBaseError {
type = "out-of-memory"
}

export class NotFoundError extends GardenBaseError {
type = "not-found"
}
Expand Down
80 changes: 51 additions & 29 deletions core/src/plugins/kubernetes/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
RuntimeError,
ConfigurationError,
ParameterError,
OutOfMemoryError,
} from "../../exceptions"
import { KubernetesProvider } from "./config"
import { Writable, Readable } from "stream"
Expand Down Expand Up @@ -333,18 +334,6 @@ async function runWithoutArtifacts({
let result: RunResult
const startedAt = new Date()

const timedOutResult = (logs: string) => {
return {
command: runner.getFullCommand(),
completedAt: new Date(),
log: "Command timed out." + (logs ? ` Here are the logs until the timeout occurred:\n\n${logs}` : ""),
moduleName: module.name,
startedAt,
success: false,
version,
}
}

try {
const res = await runner.runAndWait({
log,
Expand All @@ -358,8 +347,19 @@ async function runWithoutArtifacts({
version,
}
} catch (err) {
if (err.type === "timeout") {
result = timedOutResult(err.detail.logs)
if (err.type === "out-of-memory" || err.type === "timeout") {
// Command timed out or the pod container exceeded its memory limits
const errorLog =
err.type === "out-of-memory" ? makeOutOfMemoryErrorLog(err.detail.logs) : makeTimeOutErrorLog(err.detail.logs)
result = {
log: errorLog,
moduleName: module.name,
version,
success: false,
startedAt,
completedAt: new Date(),
command: runner.getFullCommand(),
}
} else if (err.type === "pod-runner") {
// Command exited with non-zero code
result = {
Expand Down Expand Up @@ -435,18 +435,6 @@ async function runWithArtifacts({
let result: RunResult
const startedAt = new Date()

const timedOutResult = (logs: string) => {
return {
command: runner.getFullCommand(),
completedAt: new Date(),
log: "Command timed out." + (logs ? ` Here are the logs until the timeout occurred:\n\n${logs}` : ""),
moduleName: module.name,
startedAt,
success: false,
version,
}
}

const timeoutSec = timeout || defaultTimeout

try {
Expand Down Expand Up @@ -543,9 +531,20 @@ async function runWithArtifacts({
} catch (err) {
const res = err.detail.result

if (err.type === "timeout") {
// Command timed out
result = timedOutResult((await runner.getMainContainerLogs()).trim())
if (err.type === "out-of-memory" || err.type === "timeout") {
// Command timed out or the pod container exceeded its memory limits
const containerLogs = (await runner.getMainContainerLogs()).trim()
const errorLog =
err.type === "out-of-memory" ? makeOutOfMemoryErrorLog(containerLogs) : makeTimeOutErrorLog(containerLogs)
result = {
log: errorLog,
moduleName: module.name,
version,
success: false,
startedAt,
completedAt: new Date(),
command: cmd,
}
} else if (err.type === "pod-runner" && res && res.exitCode) {
// Command exited with non-zero code
result = {
Expand Down Expand Up @@ -639,6 +638,18 @@ async function runWithArtifacts({

return result
}
function makeTimeOutErrorLog(containerLogs: string) {
return (
"Command timed out." + (containerLogs ? ` Here are the logs until the timeout occurred:\n\n${containerLogs}` : "")
)
}

function makeOutOfMemoryErrorLog(containerLogs?: string) {
return (
"The Pod container was OOMKilled." +
(containerLogs ? ` Here are the logs until the out-of-memory event occurred:\n\n${containerLogs}` : "")
)
}

class PodRunnerParams {
ctx: PluginContext
Expand Down Expand Up @@ -760,6 +771,17 @@ export class PodRunner extends PodRunnerParams {
const exitReason = terminated?.reason
const exitCode = terminated?.exitCode

// We've seen instances were Pods are OOMKilled but the exit code is 0 and the state that
// Garden computes is "stopped". However in those instances the exitReason is still "OOMKilled"
// and we handle that case specifically here.
if (exitCode === 137 || exitReason === "OOMKilled") {
const msg = `Pod container was OOMKilled.`
throw new OutOfMemoryError(msg, {
logs: (await getDebugLogs()) || msg,
serverPod,
})
}

if (state === "unhealthy") {
if (
exitCode !== undefined &&
Expand Down
167 changes: 165 additions & 2 deletions core/test/integ/src/plugins/kubernetes/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

import td from "testdouble"
import tmp from "tmp-promise"
import { expectError } from "../../../../helpers"
import { pathExists } from "fs-extra"
Expand All @@ -30,15 +31,19 @@ import {
makePodName,
} from "../../../../../src/plugins/kubernetes/util"
import { getContainerTestGarden } from "./container/container"
import { KubernetesPod, KubernetesResource } from "../../../../../src/plugins/kubernetes/types"
import {
KubernetesPod,
KubernetesResource,
KubernetesServerResource,
} from "../../../../../src/plugins/kubernetes/types"
import { PluginContext } from "../../../../../src/plugin-context"
import { LogEntry } from "../../../../../src/logger/log-entry"
import { sleep, StringCollector } from "../../../../../src/util/util"
import { buildHelmModules, getHelmTestGarden } from "./helm/common"
import { getBaseModule, getChartResources } from "../../../../../src/plugins/kubernetes/helm/common"
import { getModuleNamespace } from "../../../../../src/plugins/kubernetes/namespace"
import { GardenModule } from "../../../../../src/types/module"
import { V1Container, V1DaemonSet, V1Deployment, V1StatefulSet } from "@kubernetes/client-node"
import { V1Container, V1DaemonSet, V1Deployment, V1Pod, V1StatefulSet } from "@kubernetes/client-node"

describe("kubernetes Pod runner functions", () => {
let garden: Garden
Expand Down Expand Up @@ -358,6 +363,164 @@ describe("kubernetes Pod runner functions", () => {
)
})

it("should throw if Pod OOMs with exit code 137", async () => {
const mockApi = await KubeApi.factory(garden.log, ctx, provider)
const core = td.replace(mockApi, "core")

const pod = makePod(["sh", "-c", "echo foo"])
pod.spec.containers[0].resources = {
limits: {
memory: "8Mi",
},
}

runner = new PodRunner({
ctx,
pod,
namespace,
api: mockApi,
provider,
})

// We mock the pod status result to fake an OOMKilled event.
// (I tried manually generating an OOM event which worked locally but not on Minkube in CI)
const readNamespacedPodStatusRes: Partial<KubernetesServerResource<V1Pod>> = {
apiVersion: "v1",
kind: "Pod",
metadata: {
name: runner.podName,
namespace: "container-default",
},
spec: {
containers: [
{
command: ["sh", "-c", "echo foo"],
image: "busybox",
imagePullPolicy: "Always",
name: "main",
},
],
},
status: {
conditions: [
{
lastProbeTime: undefined,
lastTransitionTime: new Date(),
status: "True",
type: "PodScheduled",
},
],
containerStatuses: [
{
image: "busybox:latest",
imageID: "docker-pullable://busybox@sha256:some-hash",
lastState: {},
name: "main",
ready: true,
restartCount: 0,
started: true,
state: {
terminated: {
reason: "OOMKilled",
exitCode: 137,
},
},
},
],
phase: "Running",
startTime: new Date(),
},
}
td.when(core.readNamespacedPodStatus(runner.podName, namespace)).thenResolve(readNamespacedPodStatusRes)

await expectError(
() => runner.runAndWait({ log, remove: true, tty: false }),
(err) => {
expect(err.type).to.eql("out-of-memory")
expect(err.message).to.include("OOMKilled")
}
)
})

it("should throw if exit reason is OOMKilled, even if exit code is 0", async () => {
const mockApi = await KubeApi.factory(garden.log, ctx, provider)
const core = td.replace(mockApi, "core")

const pod = makePod(["sh", "-c", "echo foo"])
pod.spec.containers[0].resources = {
limits: {
memory: "8Mi",
},
}

runner = new PodRunner({
ctx,
pod,
namespace,
api: mockApi,
provider,
})

// Here we're specifically testing the case where the exit code is 0 but the exit reason
// is "OOMKilled" which is something we've seen happen "in the wild".
const readNamespacedPodStatusRes: Partial<KubernetesServerResource<V1Pod>> = {
apiVersion: "v1",
kind: "Pod",
metadata: {
name: runner.podName,
namespace: "container-default",
},
spec: {
containers: [
{
command: ["sh", "-c", "echo foo"],
image: "busybox",
imagePullPolicy: "Always",
name: "main",
},
],
},
status: {
conditions: [
{
lastProbeTime: undefined,
lastTransitionTime: new Date(),
status: "True",
type: "PodScheduled",
},
],
containerStatuses: [
{
image: "busybox:latest",
imageID: "docker-pullable://busybox@sha256:some-hash",
lastState: {},
name: "main",
ready: true,
restartCount: 0,
started: true,
state: {
terminated: {
reason: "OOMKilled",
exitCode: 0, // <-----
},
},
},
],
phase: "Running",
startTime: new Date(),
},
}
td.when(core.readNamespacedPodStatus(runner.podName, namespace)).thenResolve(readNamespacedPodStatusRes)

await expectError(
() => runner.runAndWait({ log, remove: true, tty: false }),
(err) => {
expect(err.type).to.eql("out-of-memory")
expect(err.message).to.include("OOMKilled")
}
)
})

context("tty=true", () => {
it("attaches to the process stdio during execution", async () => {
const pod = makePod([
Expand Down

0 comments on commit 9dd044a

Please sign in to comment.