From 81a24098f00accae406d252247dd53be98bce19c Mon Sep 17 00:00:00 2001 From: Drew Bailey <2614075+drewbailey@users.noreply.github.com> Date: Mon, 6 Jan 2020 15:56:31 -0500 Subject: [PATCH 1/2] Update Evicted allocations to lost when lost If an alloc is being preempted and marked as evict, but the underlying node is lost before the migration takes place, the allocation currently stays as desired evict, status running forever, or until the node comes back online. This commit updates updateNonTerminalAllocsToLost to check for a destired status of Evict as well as Stop when updating allocations on tainted nodes. switch to table test for lost node cases --- scheduler/generic_sched_test.go | 177 ++++++++++++++++++-------------- scheduler/util.go | 7 +- 2 files changed, 103 insertions(+), 81 deletions(-) diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 174d83b2235..f7b06da7e9e 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) { } func TestServiceSched_NodeDown(t *testing.T) { - h := NewHarness(t) - - // Register a node - node := mock.Node() - node.Status = structs.NodeStatusDown - require.NoError(t, h.State.UpsertNode(h.NextIndex(), node)) - - // Generate a fake job with allocations and an update policy. - job := mock.Job() - require.NoError(t, h.State.UpsertJob(h.NextIndex(), job)) - - var allocs []*structs.Allocation - for i := 0; i < 10; i++ { - alloc := mock.Alloc() - alloc.Job = job - alloc.JobID = job.ID - alloc.NodeID = node.ID - alloc.Name = fmt.Sprintf("my-job.web[%d]", i) - allocs = append(allocs, alloc) + cases := []struct { + desired string + client string + migrate bool + reschedule bool + terminal bool + lost bool + }{ + { + desired: structs.AllocDesiredStatusStop, + client: structs.AllocClientStatusRunning, + lost: true, + }, + { + desired: structs.AllocDesiredStatusRun, + client: structs.AllocClientStatusPending, + migrate: true, + }, + { + desired: structs.AllocDesiredStatusRun, + client: structs.AllocClientStatusRunning, + migrate: true, + }, + { + desired: structs.AllocDesiredStatusRun, + client: structs.AllocClientStatusLost, + terminal: true, + }, + { + desired: structs.AllocDesiredStatusRun, + client: structs.AllocClientStatusComplete, + terminal: true, + }, + { + desired: structs.AllocDesiredStatusRun, + client: structs.AllocClientStatusFailed, + reschedule: true, + }, + { + desired: structs.AllocDesiredStatusEvict, + client: structs.AllocClientStatusRunning, + lost: true, + }, } - // Cover each terminal case and ensure it doesn't change to lost - allocs[7].DesiredStatus = structs.AllocDesiredStatusRun - allocs[7].ClientStatus = structs.AllocClientStatusLost - allocs[8].DesiredStatus = structs.AllocDesiredStatusRun - allocs[8].ClientStatus = structs.AllocClientStatusFailed - allocs[9].DesiredStatus = structs.AllocDesiredStatusRun - allocs[9].ClientStatus = structs.AllocClientStatusComplete - - toBeRescheduled := map[string]bool{allocs[8].ID: true} + for i, tc := range cases { + t.Run(fmt.Sprintf(""), func(t *testing.T) { + h := NewHarness(t) - // Mark some allocs as running - for i := 0; i < 4; i++ { - out := allocs[i] - out.ClientStatus = structs.AllocClientStatusRunning - } + // Register a node + node := mock.Node() + node.Status = structs.NodeStatusDown + require.NoError(t, h.State.UpsertNode(h.NextIndex(), node)) - // Mark appropriate allocs for migration - toBeMigrated := map[string]bool{} - for i := 0; i < 3; i++ { - out := allocs[i] - out.DesiredTransition.Migrate = helper.BoolToPtr(true) - toBeMigrated[out.ID] = true - } + // Generate a fake job with allocations and an update policy. + job := mock.Job() + require.NoError(t, h.State.UpsertJob(h.NextIndex(), job)) - toBeLost := map[string]bool{} - for i := len(toBeMigrated); i < 7; i++ { - toBeLost[allocs[i].ID] = true - } + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = fmt.Sprintf("my-job.web[%d]", i) - require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) + alloc.DesiredStatus = tc.desired + alloc.ClientStatus = tc.client - // Create a mock evaluation to deal with drain - eval := &structs.Evaluation{ - Namespace: structs.DefaultNamespace, - ID: uuid.Generate(), - Priority: 50, - TriggeredBy: structs.EvalTriggerNodeUpdate, - JobID: job.ID, - NodeID: node.ID, - Status: structs.EvalStatusPending, - } - require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + // Mark for migration if necessary + alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate) - // Process the evaluation - err := h.Process(NewServiceScheduler, eval) - require.NoError(t, err) + allocs := []*structs.Allocation{alloc} + require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) - // Ensure a single plan - require.Len(t, h.Plans, 1) - plan := h.Plans[0] - - // Test the scheduler marked all non-terminal allocations as lost - require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled)) + // Create a mock evaluation to deal with drain + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + NodeID: node.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) - for _, out := range plan.NodeUpdate[node.ID] { - t.Run("alloc "+out.ID, func(t *testing.T) { - require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus) + // Process the evaluation + err := h.Process(NewServiceScheduler, eval) + require.NoError(t, err) - if toBeMigrated[out.ID] { - // there is no indicator on job itself that marks it as migrated - require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus) - } else if toBeLost[out.ID] { - require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus) - } else if toBeRescheduled[out.ID] { - require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus) + if tc.terminal { + // No plan for terminal state allocs + require.Len(t, h.Plans, 0) } else { - require.Fail(t, "unexpected alloc update") + require.Len(t, h.Plans, 1) + + plan := h.Plans[0] + out := plan.NodeUpdate[node.ID] + require.Len(t, out, 1) + + outAlloc := out[0] + if tc.migrate { + require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus) + } else if tc.reschedule { + require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus) + } else if tc.lost { + require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus) + } else { + require.Fail(t, "unexpected alloc update") + } } + + h.AssertEvalStatus(t, structs.EvalStatusComplete) }) } - - h.AssertEvalStatus(t, structs.EvalStatusComplete) } func TestServiceSched_NodeUpdate(t *testing.T) { diff --git a/scheduler/util.go b/scheduler/util.go index 4b6aa46b2f3..c20cb1dc308 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc continue } - // If the scheduler has marked it as stop already but the alloc wasn't - // terminal on the client change the status to lost. - if alloc.DesiredStatus == structs.AllocDesiredStatusStop && + // If the scheduler has marked it as stop or evict already but the alloc + // wasn't terminal on the client change the status to lost. + if (alloc.DesiredStatus == structs.AllocDesiredStatusStop || + alloc.DesiredStatus == structs.AllocDesiredStatusEvict) && (alloc.ClientStatus == structs.AllocClientStatusRunning || alloc.ClientStatus == structs.AllocClientStatusPending) { plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost) From 8d73cda788b75685712e4b54b6bd2256d99d2f53 Mon Sep 17 00:00:00 2001 From: Drew Bailey <2614075+drewbailey@users.noreply.github.com> Date: Tue, 7 Jan 2020 13:37:38 -0500 Subject: [PATCH 2/2] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f2f471906d..54da05e3a38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ BUG FIXES: * agent: Fixed race condition in logging when using `nomad monitor` command [[GH-6872](https://github.com/hashicorp/nomad/issues/6872)] * cli: Fixed a bug where `nomad monitor -node-id` would cause a cli panic when no nodes where found. [[GH-6828](https://github.com/hashicorp/nomad/issues/6828)] * consul/connect: Fixed a bug where Connect-enabled jobs failed to validate when service names used interpolation. [[GH-6855](https://github.com/hashicorp/nomad/issues/6855)] + * scheduler: Fixed a bug that caused evicted allocs on a lost node to be stuck in running. [[GH-6902](https://github.com/hashicorp/nomad/issues/6902)] ## 0.10.2 (December 4, 2019)