From 70587d62ca224c331548ffcc9dac8ad345bf7860 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Thu, 7 Jul 2022 14:52:45 -0400 Subject: [PATCH] backport of commit dbcccc7a681e5495d3713a4b4b953218eed83532 (#13642) This pull request was automerged via backport-assistant --- .changelog/13626.txt | 3 +++ client/allocrunner/alloc_runner.go | 6 +++--- client/allocrunner/alloc_runner_test.go | 2 +- client/allocrunner/taskrunner/driver_handle.go | 10 ++++++++-- client/allocrunner/taskrunner/remotetask_hook.go | 2 +- client/allocrunner/taskrunner/task_runner.go | 4 ++-- client/config/testing.go | 5 +++++ nomad/structs/structs.go | 9 +++++---- nomad/structs/structs_test.go | 3 ++- 9 files changed, 30 insertions(+), 14 deletions(-) create mode 100644 .changelog/13626.txt diff --git a/.changelog/13626.txt b/.changelog/13626.txt new file mode 100644 index 00000000000..9c6f8670139 --- /dev/null +++ b/.changelog/13626.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a bug where max_kill_timeout client config was ignored +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 59199c0eaff..598ebc629fa 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -613,7 +613,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { } taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping leader task", "error", err, "task_name", name) @@ -636,7 +636,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping task", "error", err, "task_name", name) @@ -660,7 +660,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name) diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index 0b21b78150d..b0a8cceb451 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -125,7 +125,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing" if killingMsg != expectedKillingMsg { - return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg) + return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg) } // Task Two should be dead diff --git a/client/allocrunner/taskrunner/driver_handle.go b/client/allocrunner/taskrunner/driver_handle.go index 36427f6f245..73a376c8a0e 100644 --- a/client/allocrunner/taskrunner/driver_handle.go +++ b/client/allocrunner/taskrunner/driver_handle.go @@ -6,18 +6,24 @@ import ( "time" cstructs "github.com/hashicorp/nomad/client/structs" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" ) // NewDriverHandle returns a handle for task operations on a specific task -func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle { +func NewDriverHandle( + driver drivers.DriverPlugin, + taskID string, + task *structs.Task, + maxKillTimeout time.Duration, + net *drivers.DriverNetwork) *DriverHandle { return &DriverHandle{ driver: driver, net: net, taskID: taskID, killSignal: task.KillSignal, - killTimeout: task.KillTimeout, + killTimeout: helper.Min(task.KillTimeout, maxKillTimeout), } } diff --git a/client/allocrunner/taskrunner/remotetask_hook.go b/client/allocrunner/taskrunner/remotetask_hook.go index 2068b52d9bf..5a8ac03d12a 100644 --- a/client/allocrunner/taskrunner/remotetask_hook.go +++ b/client/allocrunner/taskrunner/remotetask_hook.go @@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest return nil } - h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride)) + h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride)) h.tr.stateLock.Lock() h.tr.localState.TaskHandle = th diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 99792315ce7..cc6dd387a03 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -871,7 +871,7 @@ func (tr *TaskRunner) runDriver() error { } tr.stateLock.Unlock() - tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net)) + tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net)) // Emit an event that we started tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) @@ -1170,7 +1170,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers } // Update driver handle on task runner - tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net)) + tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net)) return true } diff --git a/client/config/testing.go b/client/config/testing.go index 642cafdf7da..bbf5a0173e0 100644 --- a/client/config/testing.go +++ b/client/config/testing.go @@ -4,6 +4,7 @@ import ( "io/ioutil" "os" "path/filepath" + "time" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/testlog" @@ -57,5 +58,9 @@ func TestClientConfig(t testing.T) (*Config, func()) { // Loosen GC threshold conf.GCDiskUsageThreshold = 98.0 conf.GCInodeUsageThreshold = 98.0 + + // Same as default; necessary for task Event messages + conf.MaxKillTimeout = 30 * time.Second + return conf, cleanup } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 7201a700ec3..ebe8b88539d 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -6823,7 +6823,7 @@ type Task struct { // task exits, other tasks will be gracefully terminated. Leader bool - // ShutdownDelay is the duration of the delay between deregistering a + // ShutdownDelay is the duration of the delay between de-registering a // task from Consul and sending it a signal to shutdown. See #2441 ShutdownDelay time.Duration @@ -8221,9 +8221,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent { return e } -func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent { - e.KillTimeout = timeout - e.Details["kill_timeout"] = timeout.String() +func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent { + actual := helper.Min(timeout, maxTimeout) + e.KillTimeout = actual + e.Details["kill_timeout"] = actual.String() return e } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index b7a1e7a017e..b5e9ff9ba63 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -5715,7 +5715,8 @@ func TestTaskEventPopulate(t *testing.T) { {NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"}, {NewTaskEvent(TaskKilling), "Sent interrupt"}, {NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"}, - {NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"}, + {NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"}, + {NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"}, {NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"}, {NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""}, {NewTaskEvent(TaskKilled), "Task successfully killed"},