Skip to content

Commit

Permalink
template: fix panic in change_mode=script on client restart
Browse files Browse the repository at this point in the history
When we introduced change_mode=script to templates, we passed the driver handle
down into the template manager so we could call its `Exec` method directly. But
the lifecycle of the driver handle is managed by the taskrunner and isn't
available when the template manager is first created. This has led to a series
of patches trying to fixup the behavior (#15915, #15192, #23663, #23917). Part
of the challenge in getting this right is using an interface to avoid the
circular import of the driver handle.

But the taskrunner already has a way to deal with this problem using a "lazy
handle". The other template change modes already use this indirectly through the
`Lifecycle` interface. Change the driver handle `Exec` call in the template
manager to a new `Lifecycle.Exec` call that reuses the existing behavior. This
eliminates the need for the template manager to know anything at all about the
handle state.

Fixes: #24051
  • Loading branch information
tgross committed Sep 24, 2024
1 parent 338487c commit 8e9cb77
Show file tree
Hide file tree
Showing 11 changed files with 59 additions and 119 deletions.
3 changes: 3 additions & 0 deletions .changelog/24057.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
template: Fixed a panic on client restart when using change_mode=script
```
1 change: 1 addition & 0 deletions client/allocrunner/interfaces/task_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ type TaskPoststartRequest struct {
// Stats collector
DriverStats DriverStats
}

type TaskPoststartResponse struct{}

type TaskPoststartHook interface {
Expand Down
6 changes: 6 additions & 0 deletions client/allocrunner/taskrunner/driver_handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ func (h *DriverHandle) Signal(s string) error {

// Exec is the handled used by client endpoint handler to invoke the appropriate task driver exec.
func (h *DriverHandle) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
if h == nil {
return nil, 0, ErrTaskNotRunning
}
command := append([]string{cmd}, args...)
res, err := h.driver.ExecTask(h.taskID, command, timeout)
if err != nil {
Expand All @@ -80,6 +83,9 @@ func (h *DriverHandle) ExecStreaming(ctx context.Context,
command []string,
tty bool,
stream drivers.ExecTaskStream) error {
if h == nil {
return ErrTaskNotRunning
}

if impl, ok := h.driver.(drivers.ExecTaskStreamingRawDriver); ok {
return impl.ExecTaskStreamingRaw(ctx, h.taskID, command, tty, stream)
Expand Down
4 changes: 4 additions & 0 deletions client/allocrunner/taskrunner/interfaces/lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package interfaces

import (
"context"
"time"

"github.com/hashicorp/nomad/nomad/structs"
)
Expand All @@ -20,6 +21,9 @@ type TaskLifecycle interface {
// Kill a task permanently.
Kill(ctx context.Context, event *structs.TaskEvent) error

// Exec into a running task.
Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error)

// IsRunning returns true if the task runner has a handle to the task
// driver, which is useful for distinguishing restored tasks during
// prestart hooks. But note that the driver handle could go away after you
Expand Down
13 changes: 13 additions & 0 deletions client/allocrunner/taskrunner/lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package taskrunner

import (
"context"
"time"

"github.com/hashicorp/nomad/nomad/structs"
)
Expand Down Expand Up @@ -126,6 +127,18 @@ func (tr *TaskRunner) restartImpl(ctx context.Context, event *structs.TaskEvent,
return nil
}

func (tr *TaskRunner) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
tr.logger.Trace("Exec requested")

handle := tr.getDriverHandle()
if handle == nil {
return nil, 0, ErrTaskNotRunning
}

out, code, err := handle.Exec(timeout, cmd, args)
return out, code, err
}

func (tr *TaskRunner) Signal(event *structs.TaskEvent, s string) error {
tr.logger.Trace("Signal requested", "signal", s)

Expand Down
1 change: 0 additions & 1 deletion client/allocrunner/taskrunner/task_runner_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ func (tr *TaskRunner) initHooks() {
consulNamespace: consulNamespace,
nomadNamespace: tr.alloc.Job.Namespace,
renderOnTaskRestart: task.RestartPolicy.RenderTemplates,
driverHandle: tr.handle,
}))
}

Expand Down
32 changes: 4 additions & 28 deletions client/allocrunner/taskrunner/template/template.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,6 @@ type TaskTemplateManager struct {
// runner is the consul-template runner
runner *manager.Runner

// handle is used to execute scripts
handle interfaces.ScriptExecutor
handleLock sync.Mutex

// signals is a lookup map from the string representation of a signal to its
// actual signal
signals map[string]os.Signal
Expand Down Expand Up @@ -220,14 +216,6 @@ func (tm *TaskTemplateManager) Stop() {
}
}

// SetDriverHandle sets the executor
func (tm *TaskTemplateManager) SetDriverHandle(executor interfaces.ScriptExecutor) {
tm.handleLock.Lock()
defer tm.handleLock.Unlock()
tm.handle = executor

}

// run is the long lived loop that handles errors and templates being rendered
func (tm *TaskTemplateManager) run() {
// Runner is nil if there are no templates
Expand Down Expand Up @@ -583,21 +571,10 @@ func (tm *TaskTemplateManager) handleScriptError(script *structs.ChangeScript, m
func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *sync.WaitGroup) {
defer wg.Done()

tm.handleLock.Lock()
defer tm.handleLock.Unlock()
if tm.handle == nil {
failureMsg := fmt.Sprintf(
"Template failed to run script %v with arguments %v because task driver handle is not available",
script.Command,
script.Args,
)
tm.handleScriptError(script, failureMsg)
return
}
_, exitCode, err := tm.handle.Exec(script.Timeout, script.Command, script.Args)
_, exitCode, err := tm.config.Lifecycle.Exec(script.Timeout, script.Command, script.Args)
if err != nil {
failureMsg := fmt.Sprintf(
"Template failed to run script %v with arguments %v on change: %v Exit code: %v",
"Template failed to run script %v with arguments %v on change: %v. Exit code: %v",
script.Command,
script.Args,
err,
Expand All @@ -608,7 +585,7 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
}
if exitCode != 0 {
failureMsg := fmt.Sprintf(
"Template ran script %v with arguments %v on change but it exited with code code: %v",
"Template ran script %v with arguments %v on change but it exited with code: %v",
script.Command,
script.Args,
exitCode,
Expand All @@ -619,10 +596,9 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
tm.config.Events.EmitEvent(structs.NewTaskEvent(structs.TaskHookMessage).
SetDisplayMessage(
fmt.Sprintf(
"Template successfully ran script %v with arguments: %v. Exit code: %v",
"Template successfully ran script %v with arguments: %v. Exit code: 0",
script.Command,
script.Args,
exitCode,
)))
}

Expand Down
44 changes: 4 additions & 40 deletions client/allocrunner/taskrunner/template/template_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,6 @@ const (
TestTaskName = "test-task"
)

// mockExecutor implements script executor interface
type mockExecutor struct {
DesiredExit int
DesiredErr error
}

func (m *mockExecutor) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
return []byte{}, m.DesiredExit, m.DesiredErr
}

// testHarness is used to test the TaskTemplateManager by spinning up
// Consul/Vault as needed
type testHarness struct {
Expand Down Expand Up @@ -1146,7 +1136,7 @@ func TestTaskTemplateManager_ScriptExecution(t *testing.T) {
key2 := "bar"
content1_1 := "cat"
content1_2 := "dog"
content1_3 := "goldfish"

t1 := &structs.Template{
EmbeddedTmpl: `
FOO={{key "bam"}}
Expand Down Expand Up @@ -1176,10 +1166,9 @@ BAR={{key "bar"}}
Envvars: true,
}

me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
harness.mockHooks.SetupExecTest(0, nil)
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()

// Ensure no unblock
Expand Down Expand Up @@ -1220,29 +1209,6 @@ OUTER:
t.Fatal(t, "should have received an event")
}
}

// remove the reference to the task handle and update the template contents
// again
harness.manager.SetDriverHandle(nil)
harness.consul.SetKV(t, key1, []byte(content1_3))
timeout = time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second)

OUTER2:
for {
select {
case <-harness.mockHooks.RestartCh:
t.Fatal(t, "restart not expected")
case ev := <-harness.mockHooks.EmitEventCh:
if strings.Contains(
ev.DisplayMessage, "task driver handle is not available") {
break OUTER2
}
case <-harness.mockHooks.SignalCh:
t.Fatal(t, "signal not expected")
case <-timeout:
t.Fatal(t, "should have received an event that task driver handle is unavailable")
}
}
}

// TestTaskTemplateManager_ScriptExecutionFailTask tests whether we fail the
Expand Down Expand Up @@ -1285,10 +1251,9 @@ BAR={{key "bar"}}
Envvars: true,
}

me := mockExecutor{DesiredExit: 1, DesiredErr: fmt.Errorf("Script failed")}
harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
harness.mockHooks.SetupExecTest(1, fmt.Errorf("Script failed"))
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()

// Ensure no unblock
Expand Down Expand Up @@ -1365,10 +1330,9 @@ COMMON={{key "common"}}
templateScript,
}

me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
harness := newTestHarness(t, templates, true, false)
harness.mockHooks.SetupExecTest(0, nil)
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()

// Ensure no unblock
Expand Down
39 changes: 0 additions & 39 deletions client/allocrunner/taskrunner/template_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,6 @@ type templateHookConfig struct {

// hookResources are used to fetch Consul tokens
hookResources *cstructs.AllocHookResources

// driverHandle is the task driver executor used to run scripts when the
// template change mode is set to script. Typically this will be nil in this
// config struct, unless we're restoring a task after a client restart.
driverHandle ti.ScriptExecutor
}

type templateHook struct {
Expand All @@ -72,15 +67,6 @@ type templateHook struct {
templateManager *template.TaskTemplateManager
managerLock sync.Mutex

// driverHandle is the task driver executor used by the template manager to
// run scripts when the template change mode is set to script. This value is
// set in the Poststart hook after the task has run, or passed in as
// configuration if this is a task that's being restored after a client
// restart.
//
// Must obtain a managerLock before changing. It may be nil.
driverHandle ti.ScriptExecutor

// consulNamespace is the current Consul namespace
consulNamespace string

Expand Down Expand Up @@ -113,7 +99,6 @@ func newTemplateHook(config *templateHookConfig) *templateHook {
config: config,
consulNamespace: config.consulNamespace,
logger: config.logger.Named(templateHookName),
driverHandle: config.driverHandle,
}
}

Expand Down Expand Up @@ -201,27 +186,6 @@ func (h *templateHook) Prestart(ctx context.Context, req *interfaces.TaskPrestar
return nil
}

func (h *templateHook) Poststart(_ context.Context, req *interfaces.TaskPoststartRequest, resp *interfaces.TaskPoststartResponse) error {
h.managerLock.Lock()
defer h.managerLock.Unlock()

if h.templateManager == nil {
return nil
}

if req.DriverExec != nil {
h.driverHandle = req.DriverExec
h.templateManager.SetDriverHandle(h.driverHandle)
} else {
for _, tmpl := range h.config.templates {
if tmpl.ChangeMode == structs.TemplateChangeModeScript {
return fmt.Errorf("template has change mode set to 'script' but task driver handle is not available")
}
}
}
return nil
}

func (h *templateHook) newManager() (unblock chan struct{}, err error) {
unblock = make(chan struct{})

Expand Down Expand Up @@ -263,9 +227,6 @@ func (h *templateHook) newManager() (unblock chan struct{}, err error) {
}

h.templateManager = m
if h.driverHandle != nil {
h.templateManager.SetDriverHandle(h.driverHandle)
}
return unblock, nil
}

Expand Down
16 changes: 5 additions & 11 deletions client/allocrunner/taskrunner/template_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,9 @@ func Test_templateHook_Prestart_ConsulWI(t *testing.T) {
hookResources: tt.hr,
}
h := &templateHook{
config: conf,
logger: logger,
managerLock: sync.Mutex{},
driverHandle: nil,
config: conf,
logger: logger,
managerLock: sync.Mutex{},
}
req := &interfaces.TaskPrestartRequest{
Alloc: a,
Expand Down Expand Up @@ -289,15 +288,11 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
envBuilder := taskenv.NewBuilder(mock.Node(), alloc, task, clientConfig.Region)

lifecycle := trtesting.NewMockTaskHooks()
lifecycle.SetupExecTest(117, fmt.Errorf("oh no"))
lifecycle.HasHandle = true

events := &trtesting.MockEmitter{}

executor := &simpleExec{
code: 117,
err: fmt.Errorf("oh no"),
}

hook := newTemplateHook(&templateHookConfig{
alloc: alloc,
logger: logger,
Expand All @@ -315,7 +310,6 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
clientConfig: clientConfig,
envBuilder: envBuilder,
hookResources: &cstructs.AllocHookResources{},
driverHandle: executor,
})
req := &interfaces.TaskPrestartRequest{
Alloc: alloc,
Expand All @@ -334,7 +328,7 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
gotEvents := events.Events()
must.Len(t, 1, gotEvents)
must.Eq(t, structs.TaskHookFailed, gotEvents[0].Type)
must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no Exit code: 117",
must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no. Exit code: 117",
gotEvents[0].DisplayMessage)

}
Loading

0 comments on commit 8e9cb77

Please sign in to comment.