From 94dc4afe4362621757d8103638f96fdfe34b56e0 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 24 Dec 2019 05:22:20 -0500 Subject: [PATCH] #14 oom killer: cleaner logging and unit test --- driver_test.go | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- go.sum | 2 ++ handle.go | 9 +++++++-- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/driver_test.go b/driver_test.go index 83ae38c5..77bf17de 100644 --- a/driver_test.go +++ b/driver_test.go @@ -52,7 +52,7 @@ func createBasicResources() *drivers.Resources { res := drivers.Resources{ NomadResources: &structs.AllocatedTaskResources{ Memory: structs.AllocatedMemoryResources{ - // MemoryMB: 256, + MemoryMB: 100, }, Cpu: structs.AllocatedCpuResources{ CpuShares: 250, @@ -630,6 +630,54 @@ func TestPodmanDriver_Init(t *testing.T) { } +// TestPodmanDriver_OOM checks that a task whose memory use grows past its limit reports a failed result with the OOMKilled flag set and an "OOM killer" error. +func TestPodmanDriver_OOM(t *testing.T) { + if !tu.IsCI() { + t.Parallel() + } + + taskCfg := newTaskConfig("", []string{ + // Doubles the shell variable x on every loop iteration so memory use keeps growing. 
+ "sh", + "-c", + "x=a; while true; do eval x='$x$x'; done", + }) + // enable --init + taskCfg.Init = true + + task := &drivers.TaskConfig{ + ID: uuid.Generate(), + Name: "oom", + AllocID: uuid.Generate(), + Resources: createBasicResources(), + } + // Cap memory at 10MB (overriding the createBasicResources value) so the OOM kill happens quickly. + task.Resources.NomadResources.Memory.MemoryMB = 10 + require.NoError(t, task.EncodeConcreteDriverConfig(&taskCfg)) + + d := podmanDriverHarness(t, nil) + cleanup := d.MkAllocDir(task, true) + defer cleanup() + + _, _, err := d.StartTask(task) + require.NoError(t, err) + + defer d.DestroyTask(task.ID, true) + + // Wait for the exit result; the select below fails the test if none arrives before the timeout. + waitCh, err := d.WaitTask(context.Background(), task.ID) + require.NoError(t, err) + + select { + case res := <-waitCh: + require.False(t, res.Successful(), "Should have failed because of oom but was successful") + require.True(t, res.OOMKilled, "OOM Flag not set") + require.Contains(t, res.Err.Error(), "OOM killer") + case <-time.After(time.Duration(tu.TestMultiplier()*1) * time.Second): + t.Fatalf("Container did not exit in time") + } +} + // read a tasks logfile into a string, fail on error func readLogfile(t *testing.T, task *drivers.TaskConfig) string { logfile := filepath.Join(filepath.Dir(task.StdoutPath), fmt.Sprintf("%s.stdout.0", task.Name)) diff --git a/go.sum b/go.sum index b4caec77..f1733326 100644 --- a/go.sum +++ b/go.sum @@ -323,6 +323,7 @@ github.com/gorhill/cronexpr v0.0.0-20180427100037-88b0669f7d75 h1:f0n1xnMSmBLzVf github.com/gorhill/cronexpr v0.0.0-20180427100037-88b0669f7d75/go.mod h1:g2644b03hfBX9Ov0ZBDgXXens4rxSxmqFBbhvKv2yVA= github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= +github.com/gorilla/mux v1.7.3 h1:gnP5JzjVOuiZD07fKKToCAOjS0yOpj/qPETTXCCS6hw= github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/websocket 
v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= @@ -575,6 +576,7 @@ github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFB github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.0.0/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= diff --git a/handle.go b/handle.go index 9866db7a..eb8f14cb 100644 --- a/handle.go +++ b/handle.go @@ -167,13 +167,18 @@ func (h *TaskHandle) MonitorContainer() { h.stateLock.Lock() h.procState = drivers.TaskStateExited if err != nil { - h.logger.Error("Failt to inspect stopped container, can not get exit code", "err", err) + h.exitResult.Err = fmt.Errorf("Driver was unable to get the exit code. %s: %v", h.containerID, err) + h.logger.Error("Failt to inspect stopped container, can not get exit code", "container", h.containerID, "err", err) h.exitResult.Signal = 0 h.completedAt = time.Now() } else { h.exitResult.ExitCode = int(inspectData.State.ExitCode) - h.exitResult.OOMKilled = inspectData.State.OOMKilled h.completedAt = inspectData.State.FinishedAt + if inspectData.State.OOMKilled { + h.exitResult.OOMKilled = true + h.exitResult.Err = fmt.Errorf("Podman container killed by OOM killer") + h.logger.Error("Podman container killed by OOM killer", "container", h.containerID) + } } if h.exitChannel != nil { h.exitChannel <- h.exitResult