From 509a6bfe34c114f7705f02d8cdf78182088c131a Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Mon, 15 May 2023 14:27:58 +0000 Subject: [PATCH] e2e: add e2e test for alloc restart zombies --- e2e/clientstate/allocs_test.go | 61 ++++++++++++++++++++++++ e2e/clientstate/input/alloc_zombie.nomad | 59 +++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 e2e/clientstate/allocs_test.go create mode 100644 e2e/clientstate/input/alloc_zombie.nomad diff --git a/e2e/clientstate/allocs_test.go b/e2e/clientstate/allocs_test.go new file mode 100644 index 00000000000..b4f963ea681 --- /dev/null +++ b/e2e/clientstate/allocs_test.go @@ -0,0 +1,61 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package clientstate + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" +) + +func TestClientAllocs(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + e2eutil.WaitForLeader(t, nomad) + e2eutil.WaitForNodesReady(t, nomad, 1) + + t.Run("testAllocZombie", testAllocZombie) +} + +// testAllocZombie ensures that a restart of a dead allocation does not cause +// it to come back to life in a not-quite alive state. +// +// https://github.com/hashicorp/nomad/issues/17079 +func testAllocZombie(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + jobID := "alloc-zombie-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + // start the job and wait for alloc to become failed + err := e2eutil.Register(jobID, "./input/alloc_zombie.nomad") + must.NoError(t, err) + + allocID := e2eutil.SingleAllocID(t, jobID, "", 0) + + // wait for alloc to be marked as failed + e2eutil.WaitForAllocStatus(t, nomad, allocID, "failed") + + // wait for additional failures to know we got rescheduled + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + statuses, err := e2eutil.AllocStatusesRescheduled(jobID, "") + must.NoError(t, err) + return len(statuses) > 2 + }), + wait.Timeout(1*time.Minute), + wait.Gap(1*time.Second), + )) + + // now attempt to restart our initial allocation + // which should do nothing but give us an error + output, err := e2eutil.Command("nomad", "alloc", "restart", allocID) + must.ErrorContains(t, err, "restart of an alloc that should not run") + must.StrContains(t, output, "Failed to restart allocation") +} diff --git a/e2e/clientstate/input/alloc_zombie.nomad b/e2e/clientstate/input/alloc_zombie.nomad new file mode 100644 index 00000000000..ac8ecdad1d2 --- /dev/null +++ b/e2e/clientstate/input/alloc_zombie.nomad @@ -0,0 +1,59 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: MPL-2.0 + +job "alloc_zombie" { + + group "group" { + network { + mode = "host" + port "http" {} + } + + service { + name = "alloczombie" + port = "http" + provider = "nomad" + + check { + name = "alloczombiecheck" + type = "http" + port = "http" + path = "/does/not/exist.txt" + interval = "2s" + timeout = "1s" + check_restart { + limit = 1 + grace = "3s" + } + } + } + + reschedule { + attempts = 3 + interval = "1m" + delay = "5s" + delay_function = "constant" + unlimited = false + } + + restart { + attempts = 0 + delay = "5s" + mode = "fail" + } + + task "python" { + driver = "raw_exec" + + config { + command = "python3" + args = ["-m", "http.server", "${NOMAD_PORT_http}", "--directory", "/tmp"] + } + + resources { + cpu = 10 + memory = 64 + } + } + } +}