From e1bbf300d331667a61434ace532415989feef7b2 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 22 Dec 2021 16:49:56 -0500 Subject: [PATCH 1/2] task runner: fix goroutine leak in prestart hook The task runner prestart hooks take a `joincontext` so they have the option to exit early if either of two contexts are canceled: from killing the task or client shutdown. Some tasks exit without being shutdown from the server, so neither of the joined contexts ever gets canceled and we leak the `joincontext` (48 bytes) and its internal goroutine. This primarily impacts batch jobs and any task that fails or completes early such as non-sidecar prestart lifecycle tasks. Cancel the `joincontext` after the prestart call exits to fix the leak. --- .changelog/11741.txt | 3 +++ client/allocrunner/taskrunner/task_runner_hooks.go | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 .changelog/11741.txt diff --git a/.changelog/11741.txt b/.changelog/11741.txt new file mode 100644 index 00000000000..1302fc2e3e3 --- /dev/null +++ b/.changelog/11741.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a memory and goroutine leak for batch tasks and any task that exits without being shut down from the server +``` diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go index c9ff34441ed..d16eda92802 100644 --- a/client/allocrunner/taskrunner/task_runner_hooks.go +++ b/client/allocrunner/taskrunner/task_runner_hooks.go @@ -235,9 +235,11 @@ func (tr *TaskRunner) prestart() error { } // Run the prestart hook - // use a joint context to allow any blocking pre-start hooks + // use a join context to allow any blocking pre-start hooks // to be canceled by either killCtx or shutdownCtx - joinedCtx, _ := joincontext.Join(tr.killCtx, tr.shutdownCtx) + joinedCtx, joinedCancel := joincontext.Join(tr.killCtx, tr.shutdownCtx) + defer joinedCancel() + var resp interfaces.TaskPrestartResponse if err := pre.Prestart(joinedCtx, &req, &resp); err != nil { tr.emitHookError(err, name) From 7406e6d86283c0c4d43210a31cd7492351fb26dc Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 23 Dec 2021 11:21:24 -0500 Subject: [PATCH 2/2] move joincontext to top of loop --- client/allocrunner/interfaces/task_lifecycle.go | 4 +++- client/allocrunner/taskrunner/task_runner_hooks.go | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/client/allocrunner/interfaces/task_lifecycle.go b/client/allocrunner/interfaces/task_lifecycle.go index ee99a507bd5..e02b1366faa 100644 --- a/client/allocrunner/interfaces/task_lifecycle.go +++ b/client/allocrunner/interfaces/task_lifecycle.go @@ -89,7 +89,9 @@ type TaskPrestartHook interface { // Prestart is called before the task is started including after every // restart. Prestart is not called if the allocation is terminal. // - // The context is cancelled if the task is killed or shutdown. + // The context is cancelled if the task is killed or shutdown but + // should not be stored any persistent goroutines this Prestart + // creates. Prestart(context.Context, *TaskPrestartRequest, *TaskPrestartResponse) error } diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go index d16eda92802..a92194d48ec 100644 --- a/client/allocrunner/taskrunner/task_runner_hooks.go +++ b/client/allocrunner/taskrunner/task_runner_hooks.go @@ -190,6 +190,11 @@ func (tr *TaskRunner) prestart() error { }() } + // use a join context to allow any blocking pre-start hooks + // to be canceled by either killCtx or shutdownCtx + joinedCtx, joinedCancel := joincontext.Join(tr.killCtx, tr.shutdownCtx) + defer joinedCancel() + for _, hook := range tr.runnerHooks { pre, ok := hook.(interfaces.TaskPrestartHook) if !ok { @@ -235,11 +240,6 @@ func (tr *TaskRunner) prestart() error { } // Run the prestart hook - // use a join context to allow any blocking pre-start hooks - // to be canceled by either killCtx or shutdownCtx - joinedCtx, joinedCancel := joincontext.Join(tr.killCtx, tr.shutdownCtx) - defer joinedCancel() - var resp interfaces.TaskPrestartResponse if err := pre.Prestart(joinedCtx, &req, &resp); err != nil { tr.emitHookError(err, name)