Fail alloc if alloc runner prestart hooks fail #5905

Merged 2 commits on Jul 2, 2019
8 changes: 8 additions & 0 deletions client/allocrunner/alloc_runner.go
@@ -248,10 +248,18 @@ func (ar *allocRunner) Run() {
default:
}

// When handling a (potentially restored) terminal alloc, ensure tasks and post-run hooks are run
// to perform any necessary cleanup that may not have been done prior to the earlier termination

// Run the prestart hooks if non-terminal
if ar.shouldRun() {
if err := ar.prerun(); err != nil {
ar.logger.Error("prerun failed", "error", err)

for _, tr := range ar.tasks {
tr.MarkFailedDead(fmt.Sprintf("failed to setup runner: %v", err))
}

goto POST
}
}
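The prerun method invoked above is not part of this diff. As a rough, hypothetical sketch of the mechanism (stand-in names, not code from this PR), assuming a hook interface with Name() string and Prerun() error methods as implied by allocFailingPrestartHook in the test below: the first hook error aborts the loop and propagates to Run(), which then marks every task failed-dead.

package main

import "fmt"

// runnerPrerunHook is a hypothetical stand-in for the prestart hook
// interface the alloc runner checks for on each registered hook.
type runnerPrerunHook interface {
	Name() string
	Prerun() error
}

// prerun sketches how an alloc runner might drive its prestart hooks: the
// first failure stops the loop and is returned to the caller, which marks
// every task failed-dead instead of starting it.
func prerun(hooks []interface{}) error {
	for _, hook := range hooks {
		pre, ok := hook.(runnerPrerunHook)
		if !ok {
			continue // not a prestart hook; skip it
		}
		if err := pre.Prerun(); err != nil {
			return fmt.Errorf("pre-run hook %q failed: %v", pre.Name(), err)
		}
	}
	return nil
}

type failingHook struct{}

func (*failingHook) Name() string  { return "failing_prestart" }
func (*failingHook) Prerun() error { return fmt.Errorf("failing prestart hooks") }

func main() {
	if err := prerun([]interface{}{&failingHook{}}); err != nil {
		fmt.Println("prerun failed:", err) // tasks would be marked failed-dead here
	}
}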
76 changes: 76 additions & 0 deletions client/allocrunner/alloc_runner_unix_test.go
@@ -215,3 +215,79 @@ func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
events := ar2.AllocState().TaskStates[task.Name].Events
require.Equal(t, initialRunEvents, events)
}

// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc
// runner's prestart hooks fail, the alloc and its tasks transition to a
// failed state
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
t.Parallel()

alloc := mock.Alloc()
alloc.Job.Type = structs.JobTypeBatch
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "mock_driver"
task.Config = map[string]interface{}{
"run_for": "2ms",
}
alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
Attempts: 0,
}

conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
defer cleanup()

// Maintain state for subsequent run
conf.StateDB = state.NewMemDB(conf.Logger)

// Create the alloc runner; the failing prestart hook below keeps tasks from running
ar, err := NewAllocRunner(conf)
require.NoError(t, err)

ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

go ar.Run()
defer destroy(ar)

select {
case <-ar.WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "alloc.waitCh wasn't closed")
}

testutil.WaitForResult(func() (bool, error) {
s := ar.AllocState()
if s.ClientStatus != structs.AllocClientStatusFailed {
return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
}
return true, nil
}, func(err error) {
require.NoError(t, err)
})

// once job finishes, it shouldn't run again
require.False(t, ar.shouldRun())
initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
require.Len(t, initialRunEvents, 2)

ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
require.NoError(t, err)
require.NotNil(t, ls)
require.NotNil(t, ts)
require.Equal(t, structs.TaskStateDead, ts.State)
require.True(t, ts.Failed)

// TR waitCh must be closed too!
select {
case <-ar.tasks[task.Name].WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "tr.waitCh wasn't closed")
}
}

type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
return fmt.Errorf("failing prestart hooks")
}
27 changes: 27 additions & 0 deletions client/allocrunner/taskrunner/task_runner.go
@@ -388,6 +388,33 @@ func (tr *TaskRunner) initLabels() {
}
}

// MarkFailedDead marks a task as failed and not to be run. It is intended to
// be invoked when alloc runner prestart hooks fail.
// Must never be used in conjunction with Run().
func (tr *TaskRunner) MarkFailedDead(reason string) {
Review comment from the PR author:
Here we introduce another function to call instead of Run(), breaking an invariant; but I had a very hard time justifying reusing Run() when we never want any of its logic to execute and would need some way to signal that the task should fail.

I felt that MarkFailedDead is a reasonable compromise that does the bare minimum to mark the task as failed (see the sketch at the end of this page).

defer close(tr.waitCh)

tr.stateLock.Lock()
if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
//TODO Nomad will be unable to restore this task; try to kill
// it now and fail? In general we prefer to leave running
// tasks running even if the agent encounters an error.
tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
"error", err)
}
tr.stateLock.Unlock()

event := structs.NewTaskEvent(structs.TaskSetupFailure).
SetDisplayMessage(reason).
SetFailsTask()
tr.UpdateState(structs.TaskStateDead, event)

// Run the stop hooks in case the task was restored and failed prestart
if err := tr.stop(); err != nil {
tr.logger.Error("stop failed while marking task dead", "error", err)
}
}

// Run the TaskRunner. Starts the user's task or reattaches to a restored task.
// Run closes WaitCh when it exits. Should be started in a goroutine.
func (tr *TaskRunner) Run() {
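To make the invariant from the review comment above concrete, here is a self-contained, hypothetical sketch (stand-in types, not code from this PR or from Nomad) of the call-site contract: for each task runner, the alloc runner invokes exactly one of Run() or MarkFailedDead(reason), and both close waitCh so callers can always wait.

package main

import "fmt"

// taskRunner is a stand-in for allocrunner/taskrunner.TaskRunner.
type taskRunner struct {
	name   string
	waitCh chan struct{}
}

func (tr *taskRunner) Run() {
	defer close(tr.waitCh) // Run closes waitCh when it exits...
	fmt.Printf("%s: running\n", tr.name)
}

func (tr *taskRunner) MarkFailedDead(reason string) {
	defer close(tr.waitCh) // ...and so does MarkFailedDead, so callers can always wait
	fmt.Printf("%s: failed dead: %s\n", tr.name, reason)
}

func runAlloc(tasks []*taskRunner, prerunErr error) {
	for _, tr := range tasks {
		if prerunErr != nil {
			// Prestart hooks failed: never call Run(); mark the task
			// failed instead, mirroring the goto POST path above.
			tr.MarkFailedDead(fmt.Sprintf("failed to setup runner: %v", prerunErr))
			continue
		}
		go tr.Run()
	}
}

func main() {
	t := &taskRunner{name: "web", waitCh: make(chan struct{})}
	runAlloc([]*taskRunner{t}, fmt.Errorf("failing prestart hooks"))
	<-t.waitCh // waitCh is closed on either path
}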