Skip to content

Commit

Permalink
alloc_runner: wait when starting suspicious allocs
Browse files Browse the repository at this point in the history
This commit aims to help users running with clients suseptible to the
destroyed alloc being restrarted bug upgrade to latest.  Without this,
such users will have their tasks run unexpectedly on upgrade and only
see the bug resolved after subsequent restart.

If, on restore, the client sees a pending alloc without any other
persisted info, then err on the side that it's an corrupt persisted
state of an alloc instead of the client happening to be killed right
when alloc is assigned to client.

Few reasons motivate this behavior:

Statistically speaking, corruption being the cause is more likely.  A
long running client will have higher chance of having allocs persisted
incorrectly with pending state.  Being killed right when an alloc is
about to start is relatively unlikely.

Also, delaying starting an alloc that hasn't started (by hopefully
seconds) is not as severe as launching too many allocs that may bring
client down.

More importantly, this helps customers upgrade their clients without
risking taking their clients down and destablizing their cluster. We
don't want existing users to force triggering the bug while they upgrade
and restart cluster.
  • Loading branch information
Mahmood Ali committed Aug 27, 2019
1 parent f616370 commit cbc521e
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 0 deletions.
56 changes: 56 additions & 0 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@ type allocRunner struct {
// servers have been contacted for the first time in case of a failed
// restore.
serversContactedCh chan struct{}

// waitOnServers defaults to false but will be set true if a restore
// fails and the Run method should wait until serversContactedCh is
// closed.
waitOnServers bool
}

// NewAllocRunner returns a new allocation runner.
Expand Down Expand Up @@ -243,6 +248,16 @@ func (ar *allocRunner) Run() {
// Start the alloc update handler
go ar.handleAllocUpdates()

if ar.waitOnServers {
ar.logger.Info(" waiting to contact server before restarting")
select {
case <-ar.taskStateUpdateHandlerCh:
return
case <-ar.serversContactedCh:
ar.logger.Info("server contacted; unblocking waiting alloc")
}
}

// If task update chan has been closed, that means we've been shutdown.
select {
case <-ar.taskStateUpdateHandlerCh:
Expand Down Expand Up @@ -353,9 +368,50 @@ func (ar *allocRunner) Restore() error {
}
}

ar.waitOnServers = ar.shouldWaitForServers(ds)
return nil
}

// shouldWaitForServers returns true if we suspect the alloc
// is potentially a completed alloc that got resurrected after AR was destroyed.
// In such cases, rerunning the alloc can lead to process and task exhaustion.
//
// The heaurstic used here is an alloc is suspect if it's in a pending state
// and no other task/status info is found.
//
// See:
// * https://github.com/hashicorp/nomad/pull/6207
// * https://github.com/hashicorp/nomad/issues/5984
//
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func (ar *allocRunner) shouldWaitForServers(ds *structs.AllocDeploymentStatus) bool {
alloc := ar.Alloc()

if alloc.ClientStatus != structs.AllocClientStatusPending {
return false
}

// check if we restore a task but see no other data
if ds != nil {
return false
}

tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
// corrupt alloc?!
return true
}

for _, task := range tg.Tasks {
ls, tr, _ := ar.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
if ls != nil || tr != nil {
return false
}
}

return true
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
Expand Down
41 changes: 41 additions & 0 deletions client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1059,3 +1059,44 @@ func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
require.NoError(t, err)
require.Nil(t, ts)
}

// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func TestAllocRunner_WaitForServer_Detects_Suspicious_Allocs(t *testing.T) {
t.Parallel()
alloc := mock.BatchAlloc()

conf, cleanup := testAllocRunnerConfig(t, alloc)
conf.StateDB = state.NewMemDB(conf.Logger)

defer cleanup()
ar, err := NewAllocRunner(conf)
require.NoError(t, err)
defer destroy(ar)

defer destroy(ar)
go ar.Run()

select {
case <-ar.WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "timed out waiting for alloc to complete")
}

// shouldn't wait after successful completion
require.False(t, ar.shouldWaitForServers(nil))

// new alloc runner shouldn't restore completed alloc
ar, err = NewAllocRunner(conf)
require.NoError(t, err)
ar.Restore()
require.False(t, ar.shouldWaitForServers(nil))

// simulate 0.9.5 behavior
require.NoError(t, conf.StateDB.DeleteAllocationBucket(alloc.ID))
require.NoError(t, conf.StateDB.PutAllocation(alloc))

ar, err = NewAllocRunner(conf)
require.NoError(t, err)
ar.Restore()
require.True(t, ar.shouldWaitForServers(nil))
}

0 comments on commit cbc521e

Please sign in to comment.