Skip to content

Commit

Permalink
Backport of client: ignore restart issued to terminal allocations int…
Browse files Browse the repository at this point in the history
…o release/1.5.x (#17211)

This pull request was automerged via backport-assistant
  • Loading branch information
hc-github-team-nomad-core authored May 16, 2023
1 parent c09e319 commit ba5bb14
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .changelog/17175.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where restarting a terminal allocation turns it into a zombie where allocation and task hooks will run unexpectedly
```
6 changes: 6 additions & 0 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1264,6 +1264,12 @@ func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error {

// restartTasks restarts all task runners concurrently.
func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error {

// ensure we are not trying to restart an alloc that is terminal
if !ar.shouldRun() {
return fmt.Errorf("restart of an alloc that should not run")
}

waitCh := make(chan struct{})
var err *multierror.Error
var errMutex sync.Mutex
Expand Down
61 changes: 61 additions & 0 deletions e2e/clientstate/allocs_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package clientstate

import (
"testing"
"time"

"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
)

func TestClientAllocs(t *testing.T) {
nomad := e2eutil.NomadClient(t)

e2eutil.WaitForLeader(t, nomad)
e2eutil.WaitForNodesReady(t, nomad, 1)

t.Run("testAllocZombie", testAllocZombie)
}

// testAllocZombie ensures that a restart of a dead allocation does not cause
// it to come back to life in a not-quite alive state.
//
// https://github.com/hashicorp/nomad/issues/17079
func testAllocZombie(t *testing.T) {
nomad := e2eutil.NomadClient(t)

jobID := "alloc-zombie-" + uuid.Short()
jobIDs := []string{jobID}
t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs))

// start the job and wait for alloc to become failed
err := e2eutil.Register(jobID, "./input/alloc_zombie.nomad")
must.NoError(t, err)

allocID := e2eutil.SingleAllocID(t, jobID, "", 0)

// wait for alloc to be marked as failed
e2eutil.WaitForAllocStatus(t, nomad, allocID, "failed")

// wait for additional failures to know we got rescheduled
must.Wait(t, wait.InitialSuccess(
wait.BoolFunc(func() bool {
statuses, err := e2eutil.AllocStatusesRescheduled(jobID, "")
must.NoError(t, err)
return len(statuses) > 2
}),
wait.Timeout(1*time.Minute),
wait.Gap(1*time.Second),
))

// now attempt to restart our initial allocation
// which should do nothing but give us an error
output, err := e2eutil.Command("nomad", "alloc", "restart", allocID)
must.ErrorContains(t, err, "restart of an alloc that should not run")
must.StrContains(t, output, "Failed to restart allocation")
}
59 changes: 59 additions & 0 deletions e2e/clientstate/input/alloc_zombie.nomad
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: MPL-2.0

job "alloc_zombie" {

group "group" {
network {
mode = "host"
port "http" {}
}

service {
name = "alloczombie"
port = "http"
provider = "nomad"

check {
name = "alloczombiecheck"
type = "http"
port = "http"
path = "/does/not/exist.txt"
interval = "2s"
timeout = "1s"
check_restart {
limit = 1
grace = "3s"
}
}
}

reschedule {
attempts = 3
interval = "1m"
delay = "5s"
delay_function = "constant"
unlimited = false
}

restart {
attempts = 0
delay = "5s"
mode = "fail"
}

task "python" {
driver = "raw_exec"

config {
command = "python3"
args = ["-m", "http.server", "${NOMAD_PORT_http}", "--directory", "/tmp"]
}

resources {
cpu = 10
memory = 64
}
}
}
}

0 comments on commit ba5bb14

Please sign in to comment.