From 8e9cb77659600bc7bb6dc766fe0d687e4476ad05 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 24 Sep 2024 11:25:10 -0400 Subject: [PATCH] template: fix panic in change_mode=script on client restart When we introduced change_mode=script to templates, we passed the driver handle down into the template manager so we could call its `Exec` method directly. But the lifecycle of the driver handle is managed by the taskrunner and isn't available when the template manager is first created. This has led to a series of patches trying to fixup the behavior (#15915, #15192, #23663, #23917). Part of the challenge in getting this right is using an interface to avoid the circular import of the driver handle. But the taskrunner already has a way to deal with this problem using a "lazy handle". The other template change modes already use this indirectly through the `Lifecycle` interface. Change the driver handle `Exec` call in the template manager to a new `Lifecycle.Exec` call that reuses the existing behavior. This eliminates the need for the template manager to know anything at all about the handle state. Fixes: https://github.com/hashicorp/nomad/issues/24051 --- .changelog/24057.txt | 3 ++ .../allocrunner/interfaces/task_lifecycle.go | 1 + .../allocrunner/taskrunner/driver_handle.go | 6 +++ .../taskrunner/interfaces/lifecycle.go | 4 ++ client/allocrunner/taskrunner/lifecycle.go | 13 ++++++ .../taskrunner/task_runner_hooks.go | 1 - .../taskrunner/template/template.go | 32 ++------------ .../taskrunner/template/template_test.go | 44 ++----------------- .../allocrunner/taskrunner/template_hook.go | 39 ---------------- .../taskrunner/template_hook_test.go | 16 +++---- .../allocrunner/taskrunner/testing/testing.go | 19 ++++++++ 11 files changed, 59 insertions(+), 119 deletions(-) create mode 100644 .changelog/24057.txt diff --git a/.changelog/24057.txt b/.changelog/24057.txt new file mode 100644 index 00000000000..35edef84966 --- /dev/null +++ b/.changelog/24057.txt @@ -0,0 +1,3 @@ +```release-note:bug +template: Fixed a panic on client restart when using change_mode=script +``` diff --git a/client/allocrunner/interfaces/task_lifecycle.go b/client/allocrunner/interfaces/task_lifecycle.go index 3c3245bbeea..3a765480dc4 100644 --- a/client/allocrunner/interfaces/task_lifecycle.go +++ b/client/allocrunner/interfaces/task_lifecycle.go @@ -129,6 +129,7 @@ type TaskPoststartRequest struct { // Stats collector DriverStats DriverStats } + type TaskPoststartResponse struct{} type TaskPoststartHook interface { diff --git a/client/allocrunner/taskrunner/driver_handle.go b/client/allocrunner/taskrunner/driver_handle.go index aefd04bfb7f..73c391b007b 100644 --- a/client/allocrunner/taskrunner/driver_handle.go +++ b/client/allocrunner/taskrunner/driver_handle.go @@ -66,6 +66,9 @@ func (h *DriverHandle) Signal(s string) error { // Exec is the handled used by client endpoint handler to invoke the appropriate task driver exec. func (h *DriverHandle) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) { + if h == nil { + return nil, 0, ErrTaskNotRunning + } command := append([]string{cmd}, args...) res, err := h.driver.ExecTask(h.taskID, command, timeout) if err != nil { @@ -80,6 +83,9 @@ func (h *DriverHandle) ExecStreaming(ctx context.Context, command []string, tty bool, stream drivers.ExecTaskStream) error { + if h == nil { + return ErrTaskNotRunning + } if impl, ok := h.driver.(drivers.ExecTaskStreamingRawDriver); ok { return impl.ExecTaskStreamingRaw(ctx, h.taskID, command, tty, stream) diff --git a/client/allocrunner/taskrunner/interfaces/lifecycle.go b/client/allocrunner/taskrunner/interfaces/lifecycle.go index f89b6dc5a6f..20c57d14ddf 100644 --- a/client/allocrunner/taskrunner/interfaces/lifecycle.go +++ b/client/allocrunner/taskrunner/interfaces/lifecycle.go @@ -5,6 +5,7 @@ package interfaces import ( "context" + "time" "github.com/hashicorp/nomad/nomad/structs" ) @@ -20,6 +21,9 @@ type TaskLifecycle interface { // Kill a task permanently. Kill(ctx context.Context, event *structs.TaskEvent) error + // Exec into a running task. + Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) + // IsRunning returns true if the task runner has a handle to the task // driver, which is useful for distinguishing restored tasks during // prestart hooks. But note that the driver handle could go away after you diff --git a/client/allocrunner/taskrunner/lifecycle.go b/client/allocrunner/taskrunner/lifecycle.go index e1fdb413786..7e412f98e67 100644 --- a/client/allocrunner/taskrunner/lifecycle.go +++ b/client/allocrunner/taskrunner/lifecycle.go @@ -5,6 +5,7 @@ package taskrunner import ( "context" + "time" "github.com/hashicorp/nomad/nomad/structs" ) @@ -126,6 +127,18 @@ func (tr *TaskRunner) restartImpl(ctx context.Context, event *structs.TaskEvent, return nil } +func (tr *TaskRunner) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) { + tr.logger.Trace("Exec requested") + + handle := tr.getDriverHandle() + if handle == nil { + return nil, 0, ErrTaskNotRunning + } + + out, code, err := handle.Exec(timeout, cmd, args) + return out, code, err +} + func (tr *TaskRunner) Signal(event *structs.TaskEvent, s string) error { tr.logger.Trace("Signal requested", "signal", s) diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go index 5b5f64708b3..b0fdcea1a4d 100644 --- a/client/allocrunner/taskrunner/task_runner_hooks.go +++ b/client/allocrunner/taskrunner/task_runner_hooks.go @@ -126,7 +126,6 @@ func (tr *TaskRunner) initHooks() { consulNamespace: consulNamespace, nomadNamespace: tr.alloc.Job.Namespace, renderOnTaskRestart: task.RestartPolicy.RenderTemplates, - driverHandle: tr.handle, })) } diff --git a/client/allocrunner/taskrunner/template/template.go b/client/allocrunner/taskrunner/template/template.go index e5fcc6dbdea..ca99b01a652 100644 --- a/client/allocrunner/taskrunner/template/template.go +++ b/client/allocrunner/taskrunner/template/template.go @@ -60,10 +60,6 @@ type TaskTemplateManager struct { // runner is the consul-template runner runner *manager.Runner - // handle is used to execute scripts - handle interfaces.ScriptExecutor - handleLock sync.Mutex - // signals is a lookup map from the string representation of a signal to its // actual signal signals map[string]os.Signal @@ -220,14 +216,6 @@ func (tm *TaskTemplateManager) Stop() { } } -// SetDriverHandle sets the executor -func (tm *TaskTemplateManager) SetDriverHandle(executor interfaces.ScriptExecutor) { - tm.handleLock.Lock() - defer tm.handleLock.Unlock() - tm.handle = executor - -} - // run is the long lived loop that handles errors and templates being rendered func (tm *TaskTemplateManager) run() { // Runner is nil if there are no templates @@ -583,21 +571,10 @@ func (tm *TaskTemplateManager) handleScriptError(script *structs.ChangeScript, m func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *sync.WaitGroup) { defer wg.Done() - tm.handleLock.Lock() - defer tm.handleLock.Unlock() - if tm.handle == nil { - failureMsg := fmt.Sprintf( - "Template failed to run script %v with arguments %v because task driver handle is not available", - script.Command, - script.Args, - ) - tm.handleScriptError(script, failureMsg) - return - } - _, exitCode, err := tm.handle.Exec(script.Timeout, script.Command, script.Args) + _, exitCode, err := tm.config.Lifecycle.Exec(script.Timeout, script.Command, script.Args) if err != nil { failureMsg := fmt.Sprintf( - "Template failed to run script %v with arguments %v on change: %v Exit code: %v", + "Template failed to run script %v with arguments %v on change: %v. Exit code: %v", script.Command, script.Args, err, @@ -608,7 +585,7 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s } if exitCode != 0 { failureMsg := fmt.Sprintf( - "Template ran script %v with arguments %v on change but it exited with code code: %v", + "Template ran script %v with arguments %v on change but it exited with code: %v", script.Command, script.Args, exitCode, @@ -619,10 +596,9 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s tm.config.Events.EmitEvent(structs.NewTaskEvent(structs.TaskHookMessage). SetDisplayMessage( fmt.Sprintf( - "Template successfully ran script %v with arguments: %v. Exit code: %v", + "Template successfully ran script %v with arguments: %v. Exit code: 0", script.Command, script.Args, - exitCode, ))) } diff --git a/client/allocrunner/taskrunner/template/template_test.go b/client/allocrunner/taskrunner/template/template_test.go index 66d7e8b4c21..4811e364a5b 100644 --- a/client/allocrunner/taskrunner/template/template_test.go +++ b/client/allocrunner/taskrunner/template/template_test.go @@ -59,16 +59,6 @@ const ( TestTaskName = "test-task" ) -// mockExecutor implements script executor interface -type mockExecutor struct { - DesiredExit int - DesiredErr error -} - -func (m *mockExecutor) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) { - return []byte{}, m.DesiredExit, m.DesiredErr -} - // testHarness is used to test the TaskTemplateManager by spinning up // Consul/Vault as needed type testHarness struct { @@ -1146,7 +1136,7 @@ func TestTaskTemplateManager_ScriptExecution(t *testing.T) { key2 := "bar" content1_1 := "cat" content1_2 := "dog" - content1_3 := "goldfish" + t1 := &structs.Template{ EmbeddedTmpl: ` FOO={{key "bam"}} @@ -1176,10 +1166,9 @@ BAR={{key "bar"}} Envvars: true, } - me := mockExecutor{DesiredExit: 0, DesiredErr: nil} harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false) + harness.mockHooks.SetupExecTest(0, nil) harness.start(t) - harness.manager.SetDriverHandle(&me) defer harness.stop() // Ensure no unblock @@ -1220,29 +1209,6 @@ OUTER: t.Fatal(t, "should have received an event") } } - - // remove the reference to the task handle and update the template contents - // again - harness.manager.SetDriverHandle(nil) - harness.consul.SetKV(t, key1, []byte(content1_3)) - timeout = time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second) - -OUTER2: - for { - select { - case <-harness.mockHooks.RestartCh: - t.Fatal(t, "restart not expected") - case ev := <-harness.mockHooks.EmitEventCh: - if strings.Contains( - ev.DisplayMessage, "task driver handle is not available") { - break OUTER2 - } - case <-harness.mockHooks.SignalCh: - t.Fatal(t, "signal not expected") - case <-timeout: - t.Fatal(t, "should have received an event that task driver handle is unavailable") - } - } } // TestTaskTemplateManager_ScriptExecutionFailTask tests whether we fail the @@ -1285,10 +1251,9 @@ BAR={{key "bar"}} Envvars: true, } - me := mockExecutor{DesiredExit: 1, DesiredErr: fmt.Errorf("Script failed")} harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false) + harness.mockHooks.SetupExecTest(1, fmt.Errorf("Script failed")) harness.start(t) - harness.manager.SetDriverHandle(&me) defer harness.stop() // Ensure no unblock @@ -1365,10 +1330,9 @@ COMMON={{key "common"}} templateScript, } - me := mockExecutor{DesiredExit: 0, DesiredErr: nil} harness := newTestHarness(t, templates, true, false) + harness.mockHooks.SetupExecTest(0, nil) harness.start(t) - harness.manager.SetDriverHandle(&me) defer harness.stop() // Ensure no unblock diff --git a/client/allocrunner/taskrunner/template_hook.go b/client/allocrunner/taskrunner/template_hook.go index c61a8503261..51048e96150 100644 --- a/client/allocrunner/taskrunner/template_hook.go +++ b/client/allocrunner/taskrunner/template_hook.go @@ -55,11 +55,6 @@ type templateHookConfig struct { // hookResources are used to fetch Consul tokens hookResources *cstructs.AllocHookResources - - // driverHandle is the task driver executor used to run scripts when the - // template change mode is set to script. Typically this will be nil in this - // config struct, unless we're restoring a task after a client restart. - driverHandle ti.ScriptExecutor } type templateHook struct { @@ -72,15 +67,6 @@ type templateHook struct { templateManager *template.TaskTemplateManager managerLock sync.Mutex - // driverHandle is the task driver executor used by the template manager to - // run scripts when the template change mode is set to script. This value is - // set in the Poststart hook after the task has run, or passed in as - // configuration if this is a task that's being restored after a client - // restart. - // - // Must obtain a managerLock before changing. It may be nil. - driverHandle ti.ScriptExecutor - // consulNamespace is the current Consul namespace consulNamespace string @@ -113,7 +99,6 @@ func newTemplateHook(config *templateHookConfig) *templateHook { config: config, consulNamespace: config.consulNamespace, logger: config.logger.Named(templateHookName), - driverHandle: config.driverHandle, } } @@ -201,27 +186,6 @@ func (h *templateHook) Prestart(ctx context.Context, req *interfaces.TaskPrestar return nil } -func (h *templateHook) Poststart(_ context.Context, req *interfaces.TaskPoststartRequest, resp *interfaces.TaskPoststartResponse) error { - h.managerLock.Lock() - defer h.managerLock.Unlock() - - if h.templateManager == nil { - return nil - } - - if req.DriverExec != nil { - h.driverHandle = req.DriverExec - h.templateManager.SetDriverHandle(h.driverHandle) - } else { - for _, tmpl := range h.config.templates { - if tmpl.ChangeMode == structs.TemplateChangeModeScript { - return fmt.Errorf("template has change mode set to 'script' but task driver handle is not available") - } - } - } - return nil -} - func (h *templateHook) newManager() (unblock chan struct{}, err error) { unblock = make(chan struct{}) @@ -263,9 +227,6 @@ func (h *templateHook) newManager() (unblock chan struct{}, err error) { } h.templateManager = m - if h.driverHandle != nil { - h.templateManager.SetDriverHandle(h.driverHandle) - } return unblock, nil } diff --git a/client/allocrunner/taskrunner/template_hook_test.go b/client/allocrunner/taskrunner/template_hook_test.go index 94f6c7e3c63..056c6a00193 100644 --- a/client/allocrunner/taskrunner/template_hook_test.go +++ b/client/allocrunner/taskrunner/template_hook_test.go @@ -121,10 +121,9 @@ func Test_templateHook_Prestart_ConsulWI(t *testing.T) { hookResources: tt.hr, } h := &templateHook{ - config: conf, - logger: logger, - managerLock: sync.Mutex{}, - driverHandle: nil, + config: conf, + logger: logger, + managerLock: sync.Mutex{}, } req := &interfaces.TaskPrestartRequest{ Alloc: a, @@ -289,15 +288,11 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) { envBuilder := taskenv.NewBuilder(mock.Node(), alloc, task, clientConfig.Region) lifecycle := trtesting.NewMockTaskHooks() + lifecycle.SetupExecTest(117, fmt.Errorf("oh no")) lifecycle.HasHandle = true events := &trtesting.MockEmitter{} - executor := &simpleExec{ - code: 117, - err: fmt.Errorf("oh no"), - } - hook := newTemplateHook(&templateHookConfig{ alloc: alloc, logger: logger, @@ -315,7 +310,6 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) { clientConfig: clientConfig, envBuilder: envBuilder, hookResources: &cstructs.AllocHookResources{}, - driverHandle: executor, }) req := &interfaces.TaskPrestartRequest{ Alloc: alloc, @@ -334,7 +328,7 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) { gotEvents := events.Events() must.Len(t, 1, gotEvents) must.Eq(t, structs.TaskHookFailed, gotEvents[0].Type) - must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no Exit code: 117", + must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no. Exit code: 117", gotEvents[0].DisplayMessage) } diff --git a/client/allocrunner/taskrunner/testing/testing.go b/client/allocrunner/taskrunner/testing/testing.go index 3760d77982f..fa32bc1c328 100644 --- a/client/allocrunner/taskrunner/testing/testing.go +++ b/client/allocrunner/taskrunner/testing/testing.go @@ -6,6 +6,7 @@ package testing import ( "context" "sync" + "time" "github.com/hashicorp/nomad/nomad/structs" ) @@ -49,6 +50,9 @@ type MockTaskHooks struct { EmitEventCh chan *structs.TaskEvent events []*structs.TaskEvent + execCode int + execErr error + // HasHandle can be set to simulate restoring a task after client restart HasHandle bool } @@ -105,6 +109,21 @@ func (m *MockTaskHooks) Kill(ctx context.Context, event *structs.TaskEvent) erro return nil } +func (m *MockTaskHooks) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) { + m.lock.Lock() + defer m.lock.Unlock() + + return []byte{}, m.execCode, m.execErr +} + +func (m *MockTaskHooks) SetupExecTest(code int, err error) { + m.lock.Lock() + defer m.lock.Unlock() + + m.execCode = code + m.execErr = err +} + func (m *MockTaskHooks) IsRunning() bool { return m.HasHandle }