From dbcccc7a681e5495d3713a4b4b953218eed83532 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 6 Jul 2022 14:44:58 -0500 Subject: [PATCH] client: enforce max_kill_timeout client configuration This PR fixes a bug where client configuration max_kill_timeout was not being enforced. The feature was introduced in 9f44780 but seems to have been removed during the major drivers refactoring. We can make sure the value is enforced by pluming it through the DriverHandler, which now uses the lesser of the task.killTimeout or client.maxKillTimeout. Also updates Event.SetKillTimeout to require both the task.killTimeout and client.maxKillTimeout so that we don't make the mistake of using the wrong value - as it was being given only the task.killTimeout before. --- .changelog/13626.txt | 3 +++ client/allocrunner/alloc_runner.go | 6 +++--- client/allocrunner/alloc_runner_test.go | 2 +- client/allocrunner/taskrunner/driver_handle.go | 10 ++++++++-- client/allocrunner/taskrunner/remotetask_hook.go | 2 +- client/allocrunner/taskrunner/task_runner.go | 4 ++-- client/config/testing.go | 5 +++++ nomad/structs/structs.go | 9 +++++---- nomad/structs/structs_test.go | 3 ++- 9 files changed, 30 insertions(+), 14 deletions(-) create mode 100644 .changelog/13626.txt diff --git a/.changelog/13626.txt b/.changelog/13626.txt new file mode 100644 index 00000000000..9c6f8670139 --- /dev/null +++ b/.changelog/13626.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a bug where max_kill_timeout client config was ignored +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 5ef9e5743f2..f8eaf8aa86a 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -620,7 +620,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { } taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping leader task", "error", err, "task_name", name) @@ -643,7 +643,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping task", "error", err, "task_name", name) @@ -667,7 +667,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name) diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index b28c87dd4f5..fafb335739d 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -126,7 +126,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing" if killingMsg != expectedKillingMsg { - return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg) + return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg) } // Task Two should be dead diff --git a/client/allocrunner/taskrunner/driver_handle.go b/client/allocrunner/taskrunner/driver_handle.go index 36427f6f245..73a376c8a0e 100644 --- a/client/allocrunner/taskrunner/driver_handle.go +++ b/client/allocrunner/taskrunner/driver_handle.go @@ -6,18 +6,24 @@ import ( "time" cstructs "github.com/hashicorp/nomad/client/structs" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" ) // NewDriverHandle returns a handle for task operations on a specific task -func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle { +func NewDriverHandle( + driver drivers.DriverPlugin, + taskID string, + task *structs.Task, + maxKillTimeout time.Duration, + net *drivers.DriverNetwork) *DriverHandle { return &DriverHandle{ driver: driver, net: net, taskID: taskID, killSignal: task.KillSignal, - killTimeout: task.KillTimeout, + killTimeout: helper.Min(task.KillTimeout, maxKillTimeout), } } diff --git a/client/allocrunner/taskrunner/remotetask_hook.go b/client/allocrunner/taskrunner/remotetask_hook.go index 2068b52d9bf..5a8ac03d12a 100644 --- a/client/allocrunner/taskrunner/remotetask_hook.go +++ b/client/allocrunner/taskrunner/remotetask_hook.go @@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest return nil } - h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride)) + h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride)) h.tr.stateLock.Lock() h.tr.localState.TaskHandle = th diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index f5342962b49..95c87440eec 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -883,7 +883,7 @@ func (tr *TaskRunner) runDriver() error { } tr.stateLock.Unlock() - tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net)) + tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net)) // Emit an event that we started tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) @@ -1182,7 +1182,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers } // Update driver handle on task runner - tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net)) + tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net)) return true } diff --git a/client/config/testing.go b/client/config/testing.go index a326cfbd6eb..0204073fb57 100644 --- a/client/config/testing.go +++ b/client/config/testing.go @@ -4,6 +4,7 @@ import ( "io/ioutil" "os" "path/filepath" + "time" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/helper" @@ -64,5 +65,9 @@ func TestClientConfig(t testing.T) (*Config, func()) { // Loosen GC threshold conf.GCDiskUsageThreshold = 98.0 conf.GCInodeUsageThreshold = 98.0 + + // Same as default; necessary for task Event messages + conf.MaxKillTimeout = 30 * time.Second + return conf, cleanup } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 1304ddd4396..d7748d95ef4 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -6976,7 +6976,7 @@ type Task struct { // task exits, other tasks will be gracefully terminated. Leader bool - // ShutdownDelay is the duration of the delay between deregistering a + // ShutdownDelay is the duration of the delay between de-registering a // task from Consul and sending it a signal to shutdown. See #2441 ShutdownDelay time.Duration @@ -8379,9 +8379,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent { return e } -func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent { - e.KillTimeout = timeout - e.Details["kill_timeout"] = timeout.String() +func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent { + actual := helper.Min(timeout, maxTimeout) + e.KillTimeout = actual + e.Details["kill_timeout"] = actual.String() return e } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index cb155d433cd..3765016706c 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -6137,7 +6137,8 @@ func TestTaskEventPopulate(t *testing.T) { {NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"}, {NewTaskEvent(TaskKilling), "Sent interrupt"}, {NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"}, - {NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"}, + {NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"}, + {NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"}, {NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"}, {NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""}, {NewTaskEvent(TaskKilled), "Task successfully killed"},