diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c22c72b0c6..4553b99a074 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ BUG FIXES:
  * cli: Fixed a bug where `nomad monitor -node-id` would cause a cli panic when no nodes where found. [[GH-6828](https://github.com/hashicorp/nomad/issues/6828)]
  * config: Fixed a bug where agent startup would fail if the `consul.timeout` configuration was set. [[GH-6907](https://github.com/hashicorp/nomad/issues/6907)]
  * consul/connect: Fixed a bug where Connect-enabled jobs failed to validate when service names used interpolation. [[GH-6855](https://github.com/hashicorp/nomad/issues/6855)]
+ * scheduler: Fixed a bug that caused evicted allocs on a lost node to be stuck in running. [[GH-6902](https://github.com/hashicorp/nomad/issues/6902)]
 
 ## 0.10.2 (December 4, 2019)
 
diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 174d83b2235..f7b06da7e9e 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
 }
 
 func TestServiceSched_NodeDown(t *testing.T) {
-	h := NewHarness(t)
-
-	// Register a node
-	node := mock.Node()
-	node.Status = structs.NodeStatusDown
-	require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
-
-	// Generate a fake job with allocations and an update policy.
-	job := mock.Job()
-	require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
-
-	var allocs []*structs.Allocation
-	for i := 0; i < 10; i++ {
-		alloc := mock.Alloc()
-		alloc.Job = job
-		alloc.JobID = job.ID
-		alloc.NodeID = node.ID
-		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-		allocs = append(allocs, alloc)
+	cases := []struct {
+		desired    string
+		client     string
+		migrate    bool
+		reschedule bool
+		terminal   bool
+		lost       bool
+	}{
+		{
+			desired: structs.AllocDesiredStatusStop,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusPending,
+			migrate: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusRunning,
+			migrate: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusLost,
+			terminal: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusComplete,
+			terminal: true,
+		},
+		{
+			desired:    structs.AllocDesiredStatusRun,
+			client:     structs.AllocClientStatusFailed,
+			reschedule: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusEvict,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
 	}
 
-	// Cover each terminal case and ensure it doesn't change to lost
-	allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[7].ClientStatus = structs.AllocClientStatusLost
-	allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[8].ClientStatus = structs.AllocClientStatusFailed
-	allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[9].ClientStatus = structs.AllocClientStatusComplete
-
-	toBeRescheduled := map[string]bool{allocs[8].ID: true}
+	for i, tc := range cases {
+		t.Run(fmt.Sprintf("%s-%s", tc.desired, tc.client), func(t *testing.T) {
+			h := NewHarness(t)
 
-	// Mark some allocs as running
-	for i := 0; i < 4; i++ {
-		out := allocs[i]
-		out.ClientStatus = structs.AllocClientStatusRunning
-	}
+			// Register a node
+			node := mock.Node()
+			node.Status = structs.NodeStatusDown
+			require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
 
-	// Mark appropriate allocs for migration
-	toBeMigrated := map[string]bool{}
-	for i := 0; i < 3; i++ {
-		out := allocs[i]
-		out.DesiredTransition.Migrate = helper.BoolToPtr(true)
-		toBeMigrated[out.ID] = true
-	}
+			// Generate a fake job with allocations and an update policy.
+			job := mock.Job()
+			require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
 
-	toBeLost := map[string]bool{}
-	for i := len(toBeMigrated); i < 7; i++ {
-		toBeLost[allocs[i].ID] = true
-	}
+			alloc := mock.Alloc()
+			alloc.Job = job
+			alloc.JobID = job.ID
+			alloc.NodeID = node.ID
+			alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
 
-	require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
+			alloc.DesiredStatus = tc.desired
+			alloc.ClientStatus = tc.client
 
-	// Create a mock evaluation to deal with drain
-	eval := &structs.Evaluation{
-		Namespace:   structs.DefaultNamespace,
-		ID:          uuid.Generate(),
-		Priority:    50,
-		TriggeredBy: structs.EvalTriggerNodeUpdate,
-		JobID:       job.ID,
-		NodeID:      node.ID,
-		Status:      structs.EvalStatusPending,
-	}
-	require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+			// Mark for migration if necessary
+			alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)
 
-	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	require.NoError(t, err)
+			allocs := []*structs.Allocation{alloc}
+			require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
 
-	// Ensure a single plan
-	require.Len(t, h.Plans, 1)
-	plan := h.Plans[0]
-
-	// Test the scheduler marked all non-terminal allocations as lost
-	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
+			// Create a mock evaluation to deal with drain
+			eval := &structs.Evaluation{
+				Namespace:   structs.DefaultNamespace,
+				ID:          uuid.Generate(),
+				Priority:    50,
+				TriggeredBy: structs.EvalTriggerNodeUpdate,
+				JobID:       job.ID,
+				NodeID:      node.ID,
+				Status:      structs.EvalStatusPending,
+			}
+			require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
 
-	for _, out := range plan.NodeUpdate[node.ID] {
-		t.Run("alloc "+out.ID, func(t *testing.T) {
-			require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
+			// Process the evaluation
+			err := h.Process(NewServiceScheduler, eval)
+			require.NoError(t, err)
 
-			if toBeMigrated[out.ID] {
-				// there is no indicator on job itself that marks it as migrated
-				require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeLost[out.ID] {
-				require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeRescheduled[out.ID] {
-				require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
+			if tc.terminal {
+				// No plan for terminal state allocs
+				require.Len(t, h.Plans, 0)
 			} else {
-				require.Fail(t, "unexpected alloc update")
+				require.Len(t, h.Plans, 1)
+
+				plan := h.Plans[0]
+				out := plan.NodeUpdate[node.ID]
+				require.Len(t, out, 1)
+
+				outAlloc := out[0]
+				if tc.migrate {
+					require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else if tc.reschedule {
+					require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+				} else if tc.lost {
+					require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else {
+					require.Fail(t, "unexpected alloc update")
+				}
 			}
+
+			h.AssertEvalStatus(t, structs.EvalStatusComplete)
 		})
 	}
-
-	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
 
 func TestServiceSched_NodeUpdate(t *testing.T) {
diff --git a/scheduler/util.go b/scheduler/util.go
index 4b6aa46b2f3..c20cb1dc308 100644
--- a/scheduler/util.go
+++ b/scheduler/util.go
@@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
 			continue
 		}
 
-		// If the scheduler has marked it as stop already but the alloc wasn't
-		// terminal on the client change the status to lost.
-		if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
+		// If the scheduler has marked it as stop or evict already but the alloc
+		// wasn't terminal on the client, change the status to lost.
+		if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
+			alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
 			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
 				alloc.ClientStatus == structs.AllocClientStatusPending) {
 			plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)
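
For reference, a minimal, self-contained sketch of the condition this patch widens in `updateNonTerminalAllocsToLost`. The constant values below are local stand-ins written out only for illustration; in Nomad they come from the `structs.AllocDesiredStatus*` and `structs.AllocClientStatus*` constants, which are assumed here rather than imported.

```go
package main

import "fmt"

// Local stand-ins for the structs.AllocDesiredStatus* / AllocClientStatus*
// string constants; the values are assumed to match the real definitions.
const (
	desiredStop  = "stop"
	desiredEvict = "evict"

	clientPending = "pending"
	clientRunning = "running"
)

// shouldMarkLost sketches the updated check: an alloc on a tainted (down)
// node is rewritten to client status "lost" when the scheduler has already
// decided to stop or evict it but the client never reached a terminal state.
func shouldMarkLost(desired, client string) bool {
	return (desired == desiredStop || desired == desiredEvict) &&
		(client == clientRunning || client == clientPending)
}

func main() {
	// The evict/running combination is the one fixed here; previously it
	// fell through the stop-only check and stayed "running" indefinitely.
	fmt.Println(shouldMarkLost(desiredEvict, clientRunning)) // true
	fmt.Println(shouldMarkLost(desiredStop, clientPending))  // true
	fmt.Println(shouldMarkLost("run", "complete"))           // false: already terminal
}
```

The table-driven test above exercises the same combinations, including the evict/running case that previously never left the running state.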