Skip to content

Commit

Permalink
Merge pull request #3496 from hashicorp/b-auto-revert-stable
Browse files Browse the repository at this point in the history
Makes auto reverts robust against infinite revert cycles
  • Loading branch information
Preetha authored Nov 3, 2017
2 parents a94bab6 + 04feeef commit df0e6ac
Show file tree
Hide file tree
Showing 4 changed files with 289 additions and 4 deletions.
92 changes: 91 additions & 1 deletion nomad/deployment_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ func TestDeploymentEndpoint_Fail_Rollback(t *testing.T) {
// Create the second job, deployment and alloc
j2 := j.Copy()
j2.Stable = false
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"

d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
Expand Down Expand Up @@ -792,7 +794,8 @@ func TestDeploymentEndpoint_SetAllocHealth_Rollback(t *testing.T) {
// Create the second job, deployment and alloc
j2 := j.Copy()
j2.Stable = false

// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
d.JobID = j2.ID
Expand Down Expand Up @@ -857,6 +860,93 @@ func TestDeploymentEndpoint_SetAllocHealth_Rollback(t *testing.T) {
assert.EqualValues(2, jout.Version, "reverted job version")
}

// tests rollback upon alloc health failure to job with identical spec does not succeed
func TestDeploymentEndpoint_SetAllocHealth_NoRollback(t *testing.T) {
t.Parallel()
s1 := testServer(t, func(c *Config) {
c.NumSchedulers = 0 // Prevent automatic dequeue
})
defer s1.Shutdown()
codec := rpcClient(t, s1)
testutil.WaitForLeader(t, s1.RPC)
assert := assert.New(t)
state := s1.fsm.State()

// Create the original job
j := mock.Job()
j.Stable = true
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
assert.Nil(state.UpsertJob(998, j), "UpsertJob")

// Create the second job, deployment and alloc. Job has same spec as original
j2 := j.Copy()
j2.Stable = false

d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
d.JobID = j2.ID
d.JobVersion = j2.Version

a := mock.Alloc()
a.JobID = j.ID
a.DeploymentID = d.ID

assert.Nil(state.UpsertJob(999, j2), "UpsertJob")
assert.Nil(state.UpsertDeployment(1000, d), "UpsertDeployment")
assert.Nil(state.UpsertAllocs(1001, []*structs.Allocation{a}), "UpsertAllocs")

// Set the alloc as unhealthy
req := &structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
WriteRequest: structs.WriteRequest{Region: "global"},
}

// Fetch the response
var resp structs.DeploymentUpdateResponse
assert.Nil(msgpackrpc.CallWithCodec(codec, "Deployment.SetAllocHealth", req, &resp), "RPC")
assert.NotZero(resp.Index, "bad response index")
assert.Nil(resp.RevertedJobVersion, "revert version must be nil")

// Lookup the evaluation
ws := memdb.NewWatchSet()
eval, err := state.EvalByID(ws, resp.EvalID)
assert.Nil(err, "EvalByID failed")
assert.NotNil(eval, "Expect eval")
assert.Equal(eval.CreateIndex, resp.EvalCreateIndex, "eval index mismatch")
assert.Equal(eval.TriggeredBy, structs.EvalTriggerDeploymentWatcher, "eval trigger")
assert.Equal(eval.JobID, d.JobID, "eval job id")
assert.Equal(eval.DeploymentID, d.ID, "eval deployment id")
assert.Equal(eval.Status, structs.EvalStatusPending, "eval status")

// Lookup the deployment
expectedDesc := structs.DeploymentStatusDescriptionRollbackNoop(structs.DeploymentStatusDescriptionFailedAllocations, 0)
dout, err := state.DeploymentByID(ws, d.ID)
assert.Nil(err, "DeploymentByID failed")
assert.Equal(dout.Status, structs.DeploymentStatusFailed, "wrong status")
assert.Equal(dout.StatusDescription, expectedDesc, "wrong status description")
assert.Equal(resp.DeploymentModifyIndex, dout.ModifyIndex, "wrong modify index")
assert.Len(dout.TaskGroups, 1, "should have one group")
assert.Contains(dout.TaskGroups, "web", "should have web group")
assert.Equal(1, dout.TaskGroups["web"].UnhealthyAllocs, "should have one healthy")

// Lookup the allocation
aout, err := state.AllocByID(ws, a.ID)
assert.Nil(err, "AllocByID")
assert.NotNil(aout, "alloc")
assert.NotNil(aout.DeploymentStatus, "alloc deployment status")
assert.NotNil(aout.DeploymentStatus.Healthy, "alloc deployment healthy")
assert.False(*aout.DeploymentStatus.Healthy, "alloc deployment healthy")

// Lookup the job, its version should not have changed
jout, err := state.JobByID(ws, j.Namespace, j.ID)
assert.Nil(err, "JobByID")
assert.NotNil(jout, "job")
assert.EqualValues(1, jout.Version, "original job version")
}

func TestDeploymentEndpoint_List(t *testing.T) {
t.Parallel()
s1 := testServer(t, nil)
Expand Down
21 changes: 18 additions & 3 deletions nomad/deploymentwatcher/deployment_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ func (w *deploymentWatcher) SetAllocHealth(
}

if j != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
j, desc = w.handleRollbackValidity(j, desc)
}
break
}
Expand Down Expand Up @@ -185,6 +185,21 @@ func (w *deploymentWatcher) SetAllocHealth(
return nil
}

// handleRollbackValidity checks if the job being rolled back to has the same spec as the existing job
// Returns a modified description and job accordingly.
func (w *deploymentWatcher) handleRollbackValidity(rollbackJob *structs.Job, desc string) (*structs.Job, string) {
// Only rollback if job being changed has a different spec.
// This prevents an infinite revert cycle when a previously stable version of the job fails to start up during a rollback
// If the job we are trying to rollback to is identical to the current job, we stop because the rollback will not succeed.
if w.j.SpecChanged(rollbackJob) {
desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
} else {
desc = structs.DeploymentStatusDescriptionRollbackNoop(desc, rollbackJob.Version)
rollbackJob = nil
}
return rollbackJob, desc
}

func (w *deploymentWatcher) PromoteDeployment(
req *structs.DeploymentPromoteRequest,
resp *structs.DeploymentUpdateResponse) error {
Expand Down Expand Up @@ -265,7 +280,7 @@ func (w *deploymentWatcher) FailDeployment(
}

if rollbackJob != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
rollbackJob, desc = w.handleRollbackValidity(rollbackJob, desc)
} else {
desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
}
Expand Down Expand Up @@ -371,7 +386,7 @@ func (w *deploymentWatcher) watch() {
// Description should include that the job is being rolled back to
// version N
if j != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
j, desc = w.handleRollbackValidity(j, desc)
} else {
desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
}
Expand Down
174 changes: 174 additions & 0 deletions nomad/deploymentwatcher/deployments_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,9 @@ func TestWatcher_SetAllocHealth_Unhealthy_Rollback(t *testing.T) {
// Upsert the job again to get a new version
j2 := j.Copy()
j2.Stable = false
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"

assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

w.SetEnabled(true, m.state)
Expand Down Expand Up @@ -316,6 +319,66 @@ func TestWatcher_SetAllocHealth_Unhealthy_Rollback(t *testing.T) {
m.AssertNumberOfCalls(t, "UpdateDeploymentAllocHealth", 1)
}

// Test setting allocation unhealthy on job with identical spec and there should be no rollback
func TestWatcher_SetAllocHealth_Unhealthy_NoRollback(t *testing.T) {
t.Parallel()
assert := assert.New(t)
w, m := defaultTestDeploymentWatcher(t)

// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
j.Stable = true
d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups["web"].AutoRevert = true
a := mock.Alloc()
a.DeploymentID = d.ID
assert.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
assert.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
assert.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")

// Upsert the job again to get a new version
j2 := j.Copy()
j2.Stable = false

assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { assert.Equal(1, len(w.watchers), "Should have 1 deployment") })

// Assert that we get a call to UpsertDeploymentAllocHealth
matchConfig := &matchDeploymentAllocHealthRequestConfig{
DeploymentID: d.ID,
Unhealthy: []string{a.ID},
Eval: true,
DeploymentUpdate: &structs.DeploymentStatusUpdate{
DeploymentID: d.ID,
Status: structs.DeploymentStatusFailed,
StatusDescription: structs.DeploymentStatusDescriptionFailedAllocations,
},
JobVersion: nil,
}
matcher := matchDeploymentAllocHealthRequest(matchConfig)
m.On("UpdateDeploymentAllocHealth", mocker.MatchedBy(matcher)).Return(nil)

// Call SetAllocHealth
req := &structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
}
var resp structs.DeploymentUpdateResponse
err := w.SetAllocHealth(req, &resp)
assert.Nil(err, "SetAllocHealth")

testutil.WaitForResult(func() (bool, error) { return 0 == len(w.watchers), nil },
func(err error) { assert.Equal(0, len(w.watchers), "Should have no deployment") })
m.AssertNumberOfCalls(t, "UpdateDeploymentAllocHealth", 1)
}

// Test promoting a deployment
func TestWatcher_PromoteDeployment_HealthyCanaries(t *testing.T) {
t.Parallel()
Expand Down Expand Up @@ -639,6 +702,8 @@ func TestDeploymentWatcher_Watch(t *testing.T) {

// Upsert the job again to get a new version
j2 := j.Copy()
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

Expand Down Expand Up @@ -734,6 +799,115 @@ func TestDeploymentWatcher_Watch(t *testing.T) {
func(err error) { assert.Equal(0, len(w.watchers), "Should have no deployment") })
}

// Tests that the watcher fails rollback when the spec hasn't changed
func TestDeploymentWatcher_RollbackFailed(t *testing.T) {
t.Parallel()
assert := assert.New(t)
w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)

// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
j.Stable = true
d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups["web"].AutoRevert = true
a := mock.Alloc()
a.DeploymentID = d.ID
assert.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
assert.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
assert.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")

// Upsert the job again to get a new version
j2 := j.Copy()
// Modify the job to make its specification different
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { assert.Equal(1, len(w.watchers), "Should have 1 deployment") })

// Assert that we will get a createEvaluation call only once. This will
// verify that the watcher is batching allocation changes
m1 := matchUpsertEvals([]string{d.ID})
m.On("UpsertEvals", mocker.MatchedBy(m1)).Return(nil).Once()

// Update the allocs health to healthy which should create an evaluation
for i := 0; i < 5; i++ {
req := &structs.ApplyDeploymentAllocHealthRequest{
DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
HealthyAllocationIDs: []string{a.ID},
},
}
assert.Nil(m.state.UpdateDeploymentAllocHealth(m.nextIndex(), req), "UpsertDeploymentAllocHealth")
}

// Wait for there to be one eval
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}

if l := len(evals); l != 1 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}

return true, nil
}, func(err error) {
t.Fatal(err)
})

// Assert that we get a call to UpsertDeploymentStatusUpdate with roll back failed as the status
c := &matchDeploymentStatusUpdateConfig{
DeploymentID: d.ID,
Status: structs.DeploymentStatusFailed,
StatusDescription: structs.DeploymentStatusDescriptionRollbackNoop(structs.DeploymentStatusDescriptionFailedAllocations, 0),
JobVersion: nil,
Eval: true,
}
m2 := matchDeploymentStatusUpdateRequest(c)
m.On("UpdateDeploymentStatus", mocker.MatchedBy(m2)).Return(nil)

// Update the allocs health to unhealthy which will cause attempting a rollback,
// fail in that step, do status update and eval
req2 := &structs.ApplyDeploymentAllocHealthRequest{
DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
},
}
assert.Nil(m.state.UpdateDeploymentAllocHealth(m.nextIndex(), req2), "UpsertDeploymentAllocHealth")

// Wait for there to be one eval
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}

if l := len(evals); l != 2 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}

return true, nil
}, func(err error) {
t.Fatal(err)
})

m.AssertCalled(t, "UpsertEvals", mocker.MatchedBy(m1))

// verify that the job version hasn't changed after upsert
m.state.JobByID(nil, structs.DefaultNamespace, j.ID)
assert.Equal(uint64(0), j.Version, "Expected job version 0 but got ", j.Version)
}

// Test evaluations are batched between watchers
func TestWatcher_BatchEvals(t *testing.T) {
t.Parallel()
Expand Down
6 changes: 6 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -4488,6 +4488,12 @@ func DeploymentStatusDescriptionRollback(baseDescription string, jobVersion uint
return fmt.Sprintf("%s - rolling back to job version %d", baseDescription, jobVersion)
}

// DeploymentStatusDescriptionRollbackNoop is used to get the status description of
// a deployment when rolling back is not possible because it has the same specification
func DeploymentStatusDescriptionRollbackNoop(baseDescription string, jobVersion uint64) string {
return fmt.Sprintf("%s - not rolling back to stable job version %d as current job has same specification", baseDescription, jobVersion)
}

// DeploymentStatusDescriptionNoRollbackTarget is used to get the status description of
// a deployment when there is no target to rollback to but autorevet is desired.
func DeploymentStatusDescriptionNoRollbackTarget(baseDescription string) string {
Expand Down

0 comments on commit df0e6ac

Please sign in to comment.