From 380262613d3a6c4722fbfffb54ade8d94b746029 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Sat, 29 Jun 2019 03:56:40 -0500 Subject: [PATCH 1/2] Fail alloc if alloc runner prestart hooks fail When an alloc runner prestart hook fails, the task runners aren't invoked and they remain in a pending state. This leads to terrible results, some of which are: * Lockup in GC process as reported in https://github.com/hashicorp/nomad/pull/5861 * Lockup in shutdown process as TR.Shutdown() waits for WaitCh to be closed * Alloc not being restarted/rescheduled to another node (as it's still in pending state) * Unexpected restart of alloc on a client restart, potentially days/weeks after alloc expected start time! Here, we treat all tasks to have failed if alloc runner prestart hook fails. This fixes the lockups, and permits the alloc to be rescheduled on another node. While it's desirable to retry alloc runner in such failures, I opted to treat it out of scope. I'm afraid of some subtles about alloc and task runners and their idempotency that's better handled in a follow up PR. This might be one of the root causes for https://github.com/hashicorp/nomad/issues/5840 . --- client/allocrunner/alloc_runner.go | 8 +++ client/allocrunner/alloc_runner_unix_test.go | 76 ++++++++++++++++++++ client/allocrunner/taskrunner/task_runner.go | 22 ++++++ 3 files changed, 106 insertions(+) diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 544f9ba6ffa..170dabe2442 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -248,10 +248,18 @@ func (ar *allocRunner) Run() { default: } + // When handling (potentially restored) terminal alloc, ensure tasks and post-run hooks are run + // to perform any cleanup that's necessary, potentially not done prior to earlier termination + // Run the prestart hooks if non-terminal if ar.shouldRun() { if err := ar.prerun(); err != nil { ar.logger.Error("prerun failed", "error", err) + + for _, tr := range ar.tasks { + tr.MarkFailedDead(fmt.Sprintf("failed to setup runner: %v", err)) + } + goto POST } } diff --git a/client/allocrunner/alloc_runner_unix_test.go b/client/allocrunner/alloc_runner_unix_test.go index 8c31832e3c4..a438266a495 100644 --- a/client/allocrunner/alloc_runner_unix_test.go +++ b/client/allocrunner/alloc_runner_unix_test.go @@ -215,3 +215,79 @@ func TestAllocRunner_Restore_CompletedBatch(t *testing.T) { events := ar2.AllocState().TaskStates[task.Name].Events require.Equal(t, initialRunEvents, events) } + +// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc +// prestart hooks failed, then the alloc and subsequent tasks transition +// to failed state +func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) { + t.Parallel() + + alloc := mock.Alloc() + alloc.Job.Type = structs.JobTypeBatch + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config = map[string]interface{}{ + "run_for": "2ms", + } + alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{ + Attempts: 0, + } + + conf, cleanup := testAllocRunnerConfig(t, alloc.Copy()) + defer cleanup() + + // Maintain state for subsequent run + conf.StateDB = state.NewMemDB(conf.Logger) + + // Start and wait for task to be running + ar, err := NewAllocRunner(conf) + require.NoError(t, err) + + ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{}) + + go ar.Run() + defer destroy(ar) + + select { + case <-ar.WaitCh(): + case <-time.After(10 * time.Second): + require.Fail(t, "alloc.waitCh wasn't closed") + } + + testutil.WaitForResult(func() (bool, error) { + s := ar.AllocState() + if s.ClientStatus != structs.AllocClientStatusFailed { + return false, fmt.Errorf("expected complete, got %s", s.ClientStatus) + } + return true, nil + }, func(err error) { + require.NoError(t, err) + }) + + // once job finishes, it shouldn't run again + require.False(t, ar.shouldRun()) + initialRunEvents := ar.AllocState().TaskStates[task.Name].Events + require.Len(t, initialRunEvents, 2) + + ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name) + require.NoError(t, err) + require.NotNil(t, ls) + require.NotNil(t, ts) + require.Equal(t, structs.TaskStateDead, ts.State) + require.True(t, ts.Failed) + + // TR waitCh must be closed too! + select { + case <-ar.tasks[task.Name].WaitCh(): + case <-time.After(10 * time.Second): + require.Fail(t, "tr.waitCh wasn't closed") + } +} + +type allocFailingPrestartHook struct{} + +func (*allocFailingPrestartHook) Name() string { return "failing_prestart" } + +func (*allocFailingPrestartHook) Prerun() error { + return fmt.Errorf("failing prestart hooks") +} diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 233a97b116c..cf99d8c3786 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -388,6 +388,28 @@ func (tr *TaskRunner) initLabels() { } } +// Mark a task as failed and not to run. Aimed to be invoked when alloc runner +// prestart hooks failed. +// Should never be called with Run(). +func (tr *TaskRunner) MarkFailedDead(reason string) { + defer close(tr.waitCh) + + tr.stateLock.Lock() + if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil { + //TODO Nomad will be unable to restore this task; try to kill + // it now and fail? In general we prefer to leave running + // tasks running even if the agent encounters an error. + tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart", + "error", err) + } + tr.stateLock.Unlock() + + event := structs.NewTaskEvent(structs.TaskSetupFailure). + SetDisplayMessage(reason). + SetFailsTask() + tr.UpdateState(structs.TaskStateDead, event) +} + // Run the TaskRunner. Starts the user's task or reattaches to a restored task. // Run closes WaitCh when it exits. Should be started in a goroutine. func (tr *TaskRunner) Run() { From 99802390c19d950f2ba194dda0d6bf2d9158c717 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 2 Jul 2019 18:37:52 +0800 Subject: [PATCH 2/2] run post-run/post-stop task runner hooks Handle when prestart failed while restoring a task, to prevent accidentally leaking consul/logmon processes. --- client/allocrunner/taskrunner/task_runner.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index cf99d8c3786..ac9d7a1c260 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -408,6 +408,11 @@ func (tr *TaskRunner) MarkFailedDead(reason string) { SetDisplayMessage(reason). SetFailsTask() tr.UpdateState(structs.TaskStateDead, event) + + // Run the stop hooks in case task was a restored task that failed prestart + if err := tr.stop(); err != nil { + tr.logger.Error("stop failed while marking task dead", "error", err) + } } // Run the TaskRunner. Starts the user's task or reattaches to a restored task.