Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

alloc_runner: wait when starting suspicious allocs #6216

Merged
merged 3 commits into from
Aug 28, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@ type allocRunner struct {
// servers have been contacted for the first time in case of a failed
// restore.
serversContactedCh chan struct{}

// waitOnServers defaults to false but will be set true if a restore
// fails and the Run method should wait until serversContactedCh is
// closed.
waitOnServers bool
}

// NewAllocRunner returns a new allocation runner.
Expand Down Expand Up @@ -243,6 +248,16 @@ func (ar *allocRunner) Run() {
// Start the alloc update handler
go ar.handleAllocUpdates()

if ar.waitOnServers {
ar.logger.Info(" waiting to contact server before restarting")
notnoop marked this conversation as resolved.
Show resolved Hide resolved
select {
case <-ar.taskStateUpdateHandlerCh:
return
case <-ar.serversContactedCh:
ar.logger.Info("server contacted; unblocking waiting alloc")
}
notnoop marked this conversation as resolved.
Show resolved Hide resolved
}

// If task update chan has been closed, that means we've been shutdown.
select {
case <-ar.taskStateUpdateHandlerCh:
Expand Down Expand Up @@ -353,9 +368,50 @@ func (ar *allocRunner) Restore() error {
}
}

ar.waitOnServers = ar.shouldWaitForServers(ds)
return nil
}

// shouldWaitForServers returns true if we suspect the alloc
// is potentially a completed alloc that got resurrected after AR was destroyed.
// In such cases, rerunning the alloc can lead to process and task exhaustion.
//
// The heaurstic used here is an alloc is suspect if it's in a pending state
notnoop marked this conversation as resolved.
Show resolved Hide resolved
// and no other task/status info is found.
//
// See:
// * https://github.com/hashicorp/nomad/pull/6207
// * https://github.com/hashicorp/nomad/issues/5984
//
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func (ar *allocRunner) shouldWaitForServers(ds *structs.AllocDeploymentStatus) bool {
alloc := ar.Alloc()

if alloc.ClientStatus != structs.AllocClientStatusPending {
return false
}

// check if we restore a task but see no other data
if ds != nil {
return false
}

tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
// corrupt alloc?!
return true
}

for _, task := range tg.Tasks {
ls, tr, _ := ar.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
if ls != nil || tr != nil {
return false
}
}

return true
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
Expand Down
41 changes: 41 additions & 0 deletions client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1059,3 +1059,44 @@ func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
require.NoError(t, err)
require.Nil(t, ts)
}

// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func TestAllocRunner_WaitForServer_Detects_Suspicious_Allocs(t *testing.T) {
t.Parallel()
alloc := mock.BatchAlloc()

conf, cleanup := testAllocRunnerConfig(t, alloc)
conf.StateDB = state.NewMemDB(conf.Logger)

defer cleanup()
ar, err := NewAllocRunner(conf)
require.NoError(t, err)
defer destroy(ar)

defer destroy(ar)
go ar.Run()

select {
case <-ar.WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "timed out waiting for alloc to complete")
}

// shouldn't wait after successful completion
require.False(t, ar.shouldWaitForServers(nil))

// new alloc runner shouldn't restore completed alloc
ar, err = NewAllocRunner(conf)
require.NoError(t, err)
ar.Restore()
require.False(t, ar.shouldWaitForServers(nil))

// simulate 0.9.5 behavior
require.NoError(t, conf.StateDB.DeleteAllocationBucket(alloc.ID))
require.NoError(t, conf.StateDB.PutAllocation(alloc))

ar, err = NewAllocRunner(conf)
require.NoError(t, err)
ar.Restore()
require.True(t, ar.shouldWaitForServers(nil))
}