From 8e9cb77659600bc7bb6dc766fe0d687e4476ad05 Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Tue, 24 Sep 2024 11:25:10 -0400
Subject: [PATCH] template: fix panic in change_mode=script on client restart

When we introduced change_mode=script to templates, we passed the driver handle
down into the template manager so we could call its `Exec` method directly. But
the lifecycle of the driver handle is managed by the taskrunner and isn't
available when the template manager is first created. This has led to a series
of patches trying to fixup the behavior (#15915, #15192, #23663, #23917). Part
of the challenge in getting this right is using an interface to avoid the
circular import of the driver handle.

But the taskrunner already has a way to deal with this problem using a "lazy
handle". The other template change modes already use this indirectly through the
`Lifecycle` interface. Change the driver handle `Exec` call in the template
manager to a new `Lifecycle.Exec` call that reuses the existing behavior. This
eliminates the need for the template manager to know anything at all about the
handle state.

Fixes: https://github.com/hashicorp/nomad/issues/24051
---
 .changelog/24057.txt                          |  3 ++
 .../allocrunner/interfaces/task_lifecycle.go  |  1 +
 .../allocrunner/taskrunner/driver_handle.go   |  6 +++
 .../taskrunner/interfaces/lifecycle.go        |  4 ++
 client/allocrunner/taskrunner/lifecycle.go    | 13 ++++++
 .../taskrunner/task_runner_hooks.go           |  1 -
 .../taskrunner/template/template.go           | 32 ++------------
 .../taskrunner/template/template_test.go      | 44 ++-----------------
 .../allocrunner/taskrunner/template_hook.go   | 39 ----------------
 .../taskrunner/template_hook_test.go          | 16 +++----
 .../allocrunner/taskrunner/testing/testing.go | 19 ++++++++
 11 files changed, 59 insertions(+), 119 deletions(-)
 create mode 100644 .changelog/24057.txt

diff --git a/.changelog/24057.txt b/.changelog/24057.txt
new file mode 100644
index 00000000000..35edef84966
--- /dev/null
+++ b/.changelog/24057.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+template: Fixed a panic on client restart when using change_mode=script
+```
diff --git a/client/allocrunner/interfaces/task_lifecycle.go b/client/allocrunner/interfaces/task_lifecycle.go
index 3c3245bbeea..3a765480dc4 100644
--- a/client/allocrunner/interfaces/task_lifecycle.go
+++ b/client/allocrunner/interfaces/task_lifecycle.go
@@ -129,6 +129,7 @@ type TaskPoststartRequest struct {
 	// Stats collector
 	DriverStats DriverStats
 }
+
 type TaskPoststartResponse struct{}
 
 type TaskPoststartHook interface {
diff --git a/client/allocrunner/taskrunner/driver_handle.go b/client/allocrunner/taskrunner/driver_handle.go
index aefd04bfb7f..73c391b007b 100644
--- a/client/allocrunner/taskrunner/driver_handle.go
+++ b/client/allocrunner/taskrunner/driver_handle.go
@@ -66,6 +66,9 @@ func (h *DriverHandle) Signal(s string) error {
 
 // Exec is the handled used by client endpoint handler to invoke the appropriate task driver exec.
 func (h *DriverHandle) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
+	if h == nil {
+		return nil, 0, ErrTaskNotRunning
+	}
 	command := append([]string{cmd}, args...)
 	res, err := h.driver.ExecTask(h.taskID, command, timeout)
 	if err != nil {
@@ -80,6 +83,9 @@ func (h *DriverHandle) ExecStreaming(ctx context.Context,
 	command []string,
 	tty bool,
 	stream drivers.ExecTaskStream) error {
+	if h == nil {
+		return ErrTaskNotRunning
+	}
 
 	if impl, ok := h.driver.(drivers.ExecTaskStreamingRawDriver); ok {
 		return impl.ExecTaskStreamingRaw(ctx, h.taskID, command, tty, stream)
diff --git a/client/allocrunner/taskrunner/interfaces/lifecycle.go b/client/allocrunner/taskrunner/interfaces/lifecycle.go
index f89b6dc5a6f..20c57d14ddf 100644
--- a/client/allocrunner/taskrunner/interfaces/lifecycle.go
+++ b/client/allocrunner/taskrunner/interfaces/lifecycle.go
@@ -5,6 +5,7 @@ package interfaces
 
 import (
 	"context"
+	"time"
 
 	"github.com/hashicorp/nomad/nomad/structs"
 )
@@ -20,6 +21,9 @@ type TaskLifecycle interface {
 	// Kill a task permanently.
 	Kill(ctx context.Context, event *structs.TaskEvent) error
 
+	// Exec into a running task.
+	Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error)
+
 	// IsRunning returns true if the task runner has a handle to the task
 	// driver, which is useful for distinguishing restored tasks during
 	// prestart hooks. But note that the driver handle could go away after you
diff --git a/client/allocrunner/taskrunner/lifecycle.go b/client/allocrunner/taskrunner/lifecycle.go
index e1fdb413786..7e412f98e67 100644
--- a/client/allocrunner/taskrunner/lifecycle.go
+++ b/client/allocrunner/taskrunner/lifecycle.go
@@ -5,6 +5,7 @@ package taskrunner
 
 import (
 	"context"
+	"time"
 
 	"github.com/hashicorp/nomad/nomad/structs"
 )
@@ -126,6 +127,18 @@ func (tr *TaskRunner) restartImpl(ctx context.Context, event *structs.TaskEvent,
 	return nil
 }
 
+func (tr *TaskRunner) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
+	tr.logger.Trace("Exec requested")
+
+	handle := tr.getDriverHandle()
+	if handle == nil {
+		return nil, 0, ErrTaskNotRunning
+	}
+
+	out, code, err := handle.Exec(timeout, cmd, args)
+	return out, code, err
+}
+
 func (tr *TaskRunner) Signal(event *structs.TaskEvent, s string) error {
 	tr.logger.Trace("Signal requested", "signal", s)
 
diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go
index 5b5f64708b3..b0fdcea1a4d 100644
--- a/client/allocrunner/taskrunner/task_runner_hooks.go
+++ b/client/allocrunner/taskrunner/task_runner_hooks.go
@@ -126,7 +126,6 @@ func (tr *TaskRunner) initHooks() {
 			consulNamespace:     consulNamespace,
 			nomadNamespace:      tr.alloc.Job.Namespace,
 			renderOnTaskRestart: task.RestartPolicy.RenderTemplates,
-			driverHandle:        tr.handle,
 		}))
 	}
 
diff --git a/client/allocrunner/taskrunner/template/template.go b/client/allocrunner/taskrunner/template/template.go
index e5fcc6dbdea..ca99b01a652 100644
--- a/client/allocrunner/taskrunner/template/template.go
+++ b/client/allocrunner/taskrunner/template/template.go
@@ -60,10 +60,6 @@ type TaskTemplateManager struct {
 	// runner is the consul-template runner
 	runner *manager.Runner
 
-	// handle is used to execute scripts
-	handle     interfaces.ScriptExecutor
-	handleLock sync.Mutex
-
 	// signals is a lookup map from the string representation of a signal to its
 	// actual signal
 	signals map[string]os.Signal
@@ -220,14 +216,6 @@ func (tm *TaskTemplateManager) Stop() {
 	}
 }
 
-// SetDriverHandle sets the executor
-func (tm *TaskTemplateManager) SetDriverHandle(executor interfaces.ScriptExecutor) {
-	tm.handleLock.Lock()
-	defer tm.handleLock.Unlock()
-	tm.handle = executor
-
-}
-
 // run is the long lived loop that handles errors and templates being rendered
 func (tm *TaskTemplateManager) run() {
 	// Runner is nil if there are no templates
@@ -583,21 +571,10 @@ func (tm *TaskTemplateManager) handleScriptError(script *structs.ChangeScript, m
 func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *sync.WaitGroup) {
 	defer wg.Done()
 
-	tm.handleLock.Lock()
-	defer tm.handleLock.Unlock()
-	if tm.handle == nil {
-		failureMsg := fmt.Sprintf(
-			"Template failed to run script %v with arguments %v because task driver handle is not available",
-			script.Command,
-			script.Args,
-		)
-		tm.handleScriptError(script, failureMsg)
-		return
-	}
-	_, exitCode, err := tm.handle.Exec(script.Timeout, script.Command, script.Args)
+	_, exitCode, err := tm.config.Lifecycle.Exec(script.Timeout, script.Command, script.Args)
 	if err != nil {
 		failureMsg := fmt.Sprintf(
-			"Template failed to run script %v with arguments %v on change: %v Exit code: %v",
+			"Template failed to run script %v with arguments %v on change: %v. Exit code: %v",
 			script.Command,
 			script.Args,
 			err,
@@ -608,7 +585,7 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
 	}
 	if exitCode != 0 {
 		failureMsg := fmt.Sprintf(
-			"Template ran script %v with arguments %v on change but it exited with code code: %v",
+			"Template ran script %v with arguments %v on change but it exited with code: %v",
 			script.Command,
 			script.Args,
 			exitCode,
@@ -619,10 +596,9 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
 	tm.config.Events.EmitEvent(structs.NewTaskEvent(structs.TaskHookMessage).
 		SetDisplayMessage(
 			fmt.Sprintf(
-				"Template successfully ran script %v with arguments: %v. Exit code: %v",
+				"Template successfully ran script %v with arguments: %v. Exit code: 0",
 				script.Command,
 				script.Args,
-				exitCode,
 			)))
 }
 
diff --git a/client/allocrunner/taskrunner/template/template_test.go b/client/allocrunner/taskrunner/template/template_test.go
index 66d7e8b4c21..4811e364a5b 100644
--- a/client/allocrunner/taskrunner/template/template_test.go
+++ b/client/allocrunner/taskrunner/template/template_test.go
@@ -59,16 +59,6 @@ const (
 	TestTaskName = "test-task"
 )
 
-// mockExecutor implements script executor interface
-type mockExecutor struct {
-	DesiredExit int
-	DesiredErr  error
-}
-
-func (m *mockExecutor) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
-	return []byte{}, m.DesiredExit, m.DesiredErr
-}
-
 // testHarness is used to test the TaskTemplateManager by spinning up
 // Consul/Vault as needed
 type testHarness struct {
@@ -1146,7 +1136,7 @@ func TestTaskTemplateManager_ScriptExecution(t *testing.T) {
 	key2 := "bar"
 	content1_1 := "cat"
 	content1_2 := "dog"
-	content1_3 := "goldfish"
+
 	t1 := &structs.Template{
 		EmbeddedTmpl: `
 FOO={{key "bam"}}
@@ -1176,10 +1166,9 @@ BAR={{key "bar"}}
 		Envvars: true,
 	}
 
-	me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
 	harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
+	harness.mockHooks.SetupExecTest(0, nil)
 	harness.start(t)
-	harness.manager.SetDriverHandle(&me)
 	defer harness.stop()
 
 	// Ensure no unblock
@@ -1220,29 +1209,6 @@ OUTER:
 			t.Fatal(t, "should have received an event")
 		}
 	}
-
-	// remove the reference to the task handle and update the template contents
-	// again
-	harness.manager.SetDriverHandle(nil)
-	harness.consul.SetKV(t, key1, []byte(content1_3))
-	timeout = time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second)
-
-OUTER2:
-	for {
-		select {
-		case <-harness.mockHooks.RestartCh:
-			t.Fatal(t, "restart not expected")
-		case ev := <-harness.mockHooks.EmitEventCh:
-			if strings.Contains(
-				ev.DisplayMessage, "task driver handle is not available") {
-				break OUTER2
-			}
-		case <-harness.mockHooks.SignalCh:
-			t.Fatal(t, "signal not expected")
-		case <-timeout:
-			t.Fatal(t, "should have received an event that task driver handle is unavailable")
-		}
-	}
 }
 
 // TestTaskTemplateManager_ScriptExecutionFailTask tests whether we fail the
@@ -1285,10 +1251,9 @@ BAR={{key "bar"}}
 		Envvars: true,
 	}
 
-	me := mockExecutor{DesiredExit: 1, DesiredErr: fmt.Errorf("Script failed")}
 	harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
+	harness.mockHooks.SetupExecTest(1, fmt.Errorf("Script failed"))
 	harness.start(t)
-	harness.manager.SetDriverHandle(&me)
 	defer harness.stop()
 
 	// Ensure no unblock
@@ -1365,10 +1330,9 @@ COMMON={{key "common"}}
 		templateScript,
 	}
 
-	me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
 	harness := newTestHarness(t, templates, true, false)
+	harness.mockHooks.SetupExecTest(0, nil)
 	harness.start(t)
-	harness.manager.SetDriverHandle(&me)
 	defer harness.stop()
 
 	// Ensure no unblock
diff --git a/client/allocrunner/taskrunner/template_hook.go b/client/allocrunner/taskrunner/template_hook.go
index c61a8503261..51048e96150 100644
--- a/client/allocrunner/taskrunner/template_hook.go
+++ b/client/allocrunner/taskrunner/template_hook.go
@@ -55,11 +55,6 @@ type templateHookConfig struct {
 
 	// hookResources are used to fetch Consul tokens
 	hookResources *cstructs.AllocHookResources
-
-	// driverHandle is the task driver executor used to run scripts when the
-	// template change mode is set to script. Typically this will be nil in this
-	// config struct, unless we're restoring a task after a client restart.
-	driverHandle ti.ScriptExecutor
 }
 
 type templateHook struct {
@@ -72,15 +67,6 @@ type templateHook struct {
 	templateManager *template.TaskTemplateManager
 	managerLock     sync.Mutex
 
-	// driverHandle is the task driver executor used by the template manager to
-	// run scripts when the template change mode is set to script. This value is
-	// set in the Poststart hook after the task has run, or passed in as
-	// configuration if this is a task that's being restored after a client
-	// restart.
-	//
-	// Must obtain a managerLock before changing. It may be nil.
-	driverHandle ti.ScriptExecutor
-
 	// consulNamespace is the current Consul namespace
 	consulNamespace string
 
@@ -113,7 +99,6 @@ func newTemplateHook(config *templateHookConfig) *templateHook {
 		config:          config,
 		consulNamespace: config.consulNamespace,
 		logger:          config.logger.Named(templateHookName),
-		driverHandle:    config.driverHandle,
 	}
 }
 
@@ -201,27 +186,6 @@ func (h *templateHook) Prestart(ctx context.Context, req *interfaces.TaskPrestar
 	return nil
 }
 
-func (h *templateHook) Poststart(_ context.Context, req *interfaces.TaskPoststartRequest, resp *interfaces.TaskPoststartResponse) error {
-	h.managerLock.Lock()
-	defer h.managerLock.Unlock()
-
-	if h.templateManager == nil {
-		return nil
-	}
-
-	if req.DriverExec != nil {
-		h.driverHandle = req.DriverExec
-		h.templateManager.SetDriverHandle(h.driverHandle)
-	} else {
-		for _, tmpl := range h.config.templates {
-			if tmpl.ChangeMode == structs.TemplateChangeModeScript {
-				return fmt.Errorf("template has change mode set to 'script' but task driver handle is not available")
-			}
-		}
-	}
-	return nil
-}
-
 func (h *templateHook) newManager() (unblock chan struct{}, err error) {
 	unblock = make(chan struct{})
 
@@ -263,9 +227,6 @@ func (h *templateHook) newManager() (unblock chan struct{}, err error) {
 	}
 
 	h.templateManager = m
-	if h.driverHandle != nil {
-		h.templateManager.SetDriverHandle(h.driverHandle)
-	}
 	return unblock, nil
 }
 
diff --git a/client/allocrunner/taskrunner/template_hook_test.go b/client/allocrunner/taskrunner/template_hook_test.go
index 94f6c7e3c63..056c6a00193 100644
--- a/client/allocrunner/taskrunner/template_hook_test.go
+++ b/client/allocrunner/taskrunner/template_hook_test.go
@@ -121,10 +121,9 @@ func Test_templateHook_Prestart_ConsulWI(t *testing.T) {
 				hookResources: tt.hr,
 			}
 			h := &templateHook{
-				config:       conf,
-				logger:       logger,
-				managerLock:  sync.Mutex{},
-				driverHandle: nil,
+				config:      conf,
+				logger:      logger,
+				managerLock: sync.Mutex{},
 			}
 			req := &interfaces.TaskPrestartRequest{
 				Alloc:   a,
@@ -289,15 +288,11 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
 	envBuilder := taskenv.NewBuilder(mock.Node(), alloc, task, clientConfig.Region)
 
 	lifecycle := trtesting.NewMockTaskHooks()
+	lifecycle.SetupExecTest(117, fmt.Errorf("oh no"))
 	lifecycle.HasHandle = true
 
 	events := &trtesting.MockEmitter{}
 
-	executor := &simpleExec{
-		code: 117,
-		err:  fmt.Errorf("oh no"),
-	}
-
 	hook := newTemplateHook(&templateHookConfig{
 		alloc:     alloc,
 		logger:    logger,
@@ -315,7 +310,6 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
 		clientConfig:  clientConfig,
 		envBuilder:    envBuilder,
 		hookResources: &cstructs.AllocHookResources{},
-		driverHandle:  executor,
 	})
 	req := &interfaces.TaskPrestartRequest{
 		Alloc:   alloc,
@@ -334,7 +328,7 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
 	gotEvents := events.Events()
 	must.Len(t, 1, gotEvents)
 	must.Eq(t, structs.TaskHookFailed, gotEvents[0].Type)
-	must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no Exit code: 117",
+	must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no. Exit code: 117",
 		gotEvents[0].DisplayMessage)
 
 }
diff --git a/client/allocrunner/taskrunner/testing/testing.go b/client/allocrunner/taskrunner/testing/testing.go
index 3760d77982f..fa32bc1c328 100644
--- a/client/allocrunner/taskrunner/testing/testing.go
+++ b/client/allocrunner/taskrunner/testing/testing.go
@@ -6,6 +6,7 @@ package testing
 import (
 	"context"
 	"sync"
+	"time"
 
 	"github.com/hashicorp/nomad/nomad/structs"
 )
@@ -49,6 +50,9 @@ type MockTaskHooks struct {
 	EmitEventCh chan *structs.TaskEvent
 	events      []*structs.TaskEvent
 
+	execCode int
+	execErr  error
+
 	// HasHandle can be set to simulate restoring a task after client restart
 	HasHandle bool
 }
@@ -105,6 +109,21 @@ func (m *MockTaskHooks) Kill(ctx context.Context, event *structs.TaskEvent) erro
 	return nil
 }
 
+func (m *MockTaskHooks) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
+	m.lock.Lock()
+	defer m.lock.Unlock()
+
+	return []byte{}, m.execCode, m.execErr
+}
+
+func (m *MockTaskHooks) SetupExecTest(code int, err error) {
+	m.lock.Lock()
+	defer m.lock.Unlock()
+
+	m.execCode = code
+	m.execErr = err
+}
+
 func (m *MockTaskHooks) IsRunning() bool {
 	return m.HasHandle
 }