From 5bfb500932d4ac6e08cc71837b3740a264450a19 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Wed, 22 May 2024 10:23:08 -0400
Subject: [PATCH] refactor scheduler tests for node down/disconnected (#22198)

While working on #20462 #12319 I found that some of our scheduler tests
around down nodes or disconnected clients were enforcing invariants that
were unclear. This changeset pulls out some minor refactorings so that the
bug fix PR is easier to review. This includes:

* Migrating a few tests from `testify` to `shoenig/test` that I'm going to
  touch in #12319 anyways.
* Adding test names to the node down test
* Update the disconnected client test so that we always re-process the
  pending/blocked eval it creates; this eliminates 2 redundant sub-tests.
* Update the disconnected client test assertions so that they're explicit
  in the test setup rather than implied by whether we re-process the
  pending/blocked eval.

Ref: https://github.com/hashicorp/nomad/issues/20462
Ref: https://github.com/hashicorp/nomad/pull/12319
---
 scheduler/generic_sched_test.go | 221 ++++++++++++++++----------------
 1 file changed, 108 insertions(+), 113 deletions(-)

diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 5e0ec00fb80..9d5934979f0 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -18,6 +18,7 @@ import (
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/shoenig/test"
 	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -3524,6 +3525,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
 	ci.Parallel(t)
 
 	cases := []struct {
+		name       string
 		desired    string
 		client     string
 		migrate    bool
@@ -3532,36 +3534,43 @@ func TestServiceSched_NodeDown(t *testing.T) {
 		lost       bool
 	}{
 		{
+			name:    "should stop is running should be lost",
 			desired: structs.AllocDesiredStatusStop,
 			client:  structs.AllocClientStatusRunning,
 			lost:    true,
 		},
 		{
+			name:    "should run is pending should be migrate",
 			desired: structs.AllocDesiredStatusRun,
 			client:  structs.AllocClientStatusPending,
 			migrate: true,
 		},
 		{
+			name:    "should run is running should be migrate",
 			desired: structs.AllocDesiredStatusRun,
 			client:  structs.AllocClientStatusRunning,
 			migrate: true,
 		},
 		{
+			name:     "should run is lost should be terminal",
 			desired:  structs.AllocDesiredStatusRun,
 			client:   structs.AllocClientStatusLost,
 			terminal: true,
 		},
 		{
+			name:     "should run is complete should be terminal",
 			desired:  structs.AllocDesiredStatusRun,
 			client:   structs.AllocClientStatusComplete,
 			terminal: true,
 		},
 		{
+			name:       "should run is failed should be rescheduled",
 			desired:    structs.AllocDesiredStatusRun,
 			client:     structs.AllocClientStatusFailed,
 			reschedule: true,
 		},
 		{
+			name:    "should evict is running should be lost",
 			desired: structs.AllocDesiredStatusEvict,
 			client:  structs.AllocClientStatusRunning,
 			lost:    true,
@@ -3569,17 +3578,17 @@ func TestServiceSched_NodeDown(t *testing.T) {
 	}
 
 	for i, tc := range cases {
-		t.Run(fmt.Sprintf(""), func(t *testing.T) {
+		t.Run(tc.name, func(t *testing.T) {
 			h := NewHarness(t)
 
 			// Register a node
 			node := mock.Node()
 			node.Status = structs.NodeStatusDown
-			require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+			must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
 
 			// Generate a fake job with allocations and an update policy.
 			job := mock.Job()
-			require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job))
+			must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job))
 
 			alloc := mock.Alloc()
 			alloc.Job = job
@@ -3594,7 +3603,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
 			alloc.DesiredTransition.Migrate = pointer.Of(tc.migrate)
 
 			allocs := []*structs.Allocation{alloc}
-			require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
+			must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
 
 			// Create a mock evaluation to deal with drain
 			eval := &structs.Evaluation{
@@ -3606,31 +3615,30 @@ func TestServiceSched_NodeDown(t *testing.T) {
 				NodeID:      node.ID,
 				Status:      structs.EvalStatusPending,
 			}
-			require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+			must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
 
 			// Process the evaluation
 			err := h.Process(NewServiceScheduler, eval)
-			require.NoError(t, err)
+			must.NoError(t, err)
 
 			if tc.terminal {
-				// No plan for terminal state allocs
-				require.Len(t, h.Plans, 0)
+				must.Len(t, 0, h.Plans, must.Sprint("expected no plan"))
 			} else {
-				require.Len(t, h.Plans, 1)
+				must.Len(t, 1, h.Plans, must.Sprint("expected plan"))
 
 				plan := h.Plans[0]
 				out := plan.NodeUpdate[node.ID]
-				require.Len(t, out, 1)
+				must.Len(t, 1, out)
 
 				outAlloc := out[0]
 				if tc.migrate {
-					require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+					must.NotEq(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
 				} else if tc.reschedule {
-					require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+					must.Eq(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
 				} else if tc.lost {
-					require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+					must.Eq(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
 				} else {
-					require.Fail(t, "unexpected alloc update")
+					t.Fatal("unexpected alloc update")
 				}
 			}
 
@@ -3643,37 +3651,33 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 	ci.Parallel(t)
 
 	cases := []struct {
-		name        string
-		jobSpecFn   func(*structs.Job)
-		when        time.Time
-		rescheduled bool
+		name                string
+		jobSpecFn           func(*structs.Job)
+		previousStopWhen    time.Time
+		expectBlockedEval   bool
+		expectUpdate        bool
+		expectedAllocStates int
 	}{
 		// Test using stop_after_client_disconnect, remove after its deprecated in favor
 		// of Disconnect.StopOnClientAfter introduced in 1.8.0.
 		{
-			name: "legacy no stop_after_client_disconnect with reschedule",
+			name: "legacy no stop_after_client_disconnect",
 			jobSpecFn: func(job *structs.Job) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].StopAfterClientDisconnect = nil
 			},
-			rescheduled: true,
-		},
-		{
-			name: "legacy stop_after_client_disconnect without reschedule",
-			jobSpecFn: func(job *structs.Job) {
-				job.TaskGroups[0].Count = 1
-				job.TaskGroups[0].StopAfterClientDisconnect = pointer.Of(1 * time.Second)
-			},
-			rescheduled: false,
+			expectBlockedEval:   true,
+			expectedAllocStates: 1,
 		},
 		{
-			name: "legacy stop_after_client_disconnect with reschedule",
+			name: "legacy stop_after_client_disconnect reschedule now",
			jobSpecFn: func(job *structs.Job) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].StopAfterClientDisconnect = pointer.Of(1 * time.Second)
 			},
-			when:        time.Now().UTC().Add(-10 * time.Second),
-			rescheduled: true,
+			previousStopWhen:    time.Now().UTC().Add(-10 * time.Second),
+			expectBlockedEval:   true,
+			expectedAllocStates: 2,
 		},
 		{
 			name: "legacy stop_after_client_disconnect reschedule later",
@@ -3681,40 +3685,33 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].StopAfterClientDisconnect = pointer.Of(1 * time.Second)
 			},
-			when:        time.Now().UTC().Add(10 * time.Minute),
-			rescheduled: false,
+			expectBlockedEval:   false,
+			expectUpdate:        true,
+			expectedAllocStates: 1,
 		},
 		// Tests using the new disconnect block
 		{
-			name: "no StopOnClientAfter with reschedule",
+			name: "no StopOnClientAfter reschedule now",
 			jobSpecFn: func(job *structs.Job) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{
 					StopOnClientAfter: nil,
 				}
 			},
-			rescheduled: true,
+			expectBlockedEval:   true,
+			expectedAllocStates: 1,
 		},
 		{
-			name: "StopOnClientAfter without reschedule",
+			name: "StopOnClientAfter reschedule now",
 			jobSpecFn: func(job *structs.Job) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{
 					StopOnClientAfter: pointer.Of(1 * time.Second),
 				}
 			},
-			rescheduled: false,
-		},
-		{
-			name: "StopOnClientAfter with reschedule",
-			jobSpecFn: func(job *structs.Job) {
-				job.TaskGroups[0].Count = 1
-				job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{
-					StopOnClientAfter: pointer.Of(1 * time.Second),
-				}
-			},
-			when:        time.Now().UTC().Add(-10 * time.Second),
-			rescheduled: true,
+			previousStopWhen:    time.Now().UTC().Add(-10 * time.Second),
+			expectBlockedEval:   true,
+			expectedAllocStates: 2,
 		},
 		{
 			name: "StopOnClientAfter reschedule later",
@@ -3724,8 +3721,9 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 					StopOnClientAfter: pointer.Of(1 * time.Second),
 				}
 			},
-			when:        time.Now().UTC().Add(10 * time.Minute),
-			rescheduled: false,
+			expectBlockedEval:   false,
+			expectUpdate:        true,
+			expectedAllocStates: 1,
 		},
 	}
 
@@ -3751,22 +3749,22 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 			alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
 			alloc.DesiredStatus = structs.AllocDesiredStatusRun
 			alloc.ClientStatus = structs.AllocClientStatusRunning
-			if !tc.when.IsZero() {
+			if !tc.previousStopWhen.IsZero() {
 				alloc.AllocStates = []*structs.AllocState{{
 					Field: structs.AllocStateFieldClientStatus,
 					Value: structs.AllocClientStatusLost,
-					Time:  tc.when,
+					Time:  tc.previousStopWhen,
 				}}
 			}
-			allocs := []*structs.Allocation{alloc}
-			must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
+			must.NoError(t, h.State.UpsertAllocs(
+				structs.MsgTypeTestSetup, h.NextIndex(),
+				[]*structs.Allocation{alloc}))
 
-			// Create a mock evaluation to deal with drain
+			// Create a mock evaluation to deal with node going down
 			evals := []*structs.Evaluation{{
 				Namespace:   structs.DefaultNamespace,
 				ID:          uuid.Generate(),
 				Priority:    50,
-				TriggeredBy: structs.EvalTriggerNodeDrain,
+				TriggeredBy: structs.EvalTriggerNodeUpdate,
 				JobID:       job.ID,
 				NodeID:      node.ID,
 				Status:      structs.EvalStatusPending,
@@ -3782,60 +3780,65 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 
 			// One followup eval created, either delayed or blocked
 			must.Len(t, 1, h.CreateEvals)
-			e := h.CreateEvals[0]
-			must.Eq(t, eval.ID, e.PreviousEval)
-
-			if tc.rescheduled {
-				must.Eq(t, "blocked", e.Status)
-			} else {
-				must.Eq(t, "pending", e.Status)
-				must.NotEq(t, time.Time{}, e.WaitUntil)
-			}
+			followupEval := h.CreateEvals[0]
+			must.Eq(t, eval.ID, followupEval.PreviousEval)
 
-			alloc, err = h.State.AllocByID(nil, alloc.ID)
+			// Either way, no new alloc was created
+			allocs, err := h.State.AllocsByJob(nil, job.Namespace, job.ID, false)
 			must.NoError(t, err)
+			must.Len(t, 1, allocs)
+			must.Eq(t, alloc.ID, allocs[0].ID)
+			alloc = allocs[0]
 
 			// Allocations have been transitioned to lost
 			must.Eq(t, structs.AllocDesiredStatusStop, alloc.DesiredStatus)
 			must.Eq(t, structs.AllocClientStatusLost, alloc.ClientStatus)
 
-			// At least 1, 2 if we manually set the tc.when
-			must.SliceNotEmpty(t, alloc.AllocStates)
-			if tc.rescheduled {
-				// Register a new node, leave it up, process the followup eval
-				node = mock.Node()
-				must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
-				must.NoError(t, h.Process(NewServiceScheduler, eval))
+			// 1 if rescheduled, 2 for rescheduled later
+			test.Len(t, tc.expectedAllocStates, alloc.AllocStates)
 
-				as, err := h.State.AllocsByJob(nil, job.Namespace, job.ID, false)
-				must.NoError(t, err)
-				must.Len(t, 2, as)
+			if tc.expectBlockedEval {
+				must.Eq(t, structs.EvalStatusBlocked, followupEval.Status)
 
-				a2 := as[0]
-				if a2.ID == alloc.ID {
-					a2 = as[1]
+			} else {
+				must.Eq(t, structs.EvalStatusPending, followupEval.Status)
+				must.NotEq(t, time.Time{}, followupEval.WaitUntil)
+
+				if tc.expectUpdate {
+					must.Len(t, 1, h.Plans[0].NodeUpdate[node.ID])
+					must.Eq(t, structs.AllocClientStatusLost,
+						h.Plans[0].NodeUpdate[node.ID][0].ClientStatus)
+					must.MapLen(t, 0, h.Plans[0].NodeAllocation)
+				} else {
+					must.Len(t, 0, h.Plans[0].NodeUpdate[node.ID])
+					must.MapLen(t, 1, h.Plans[0].NodeAllocation)
 				}
+			}
 
-				must.Eq(t, structs.AllocClientStatusPending, a2.ClientStatus)
-				must.Eq(t, structs.AllocDesiredStatusRun, a2.DesiredStatus)
-				must.Eq(t, node.ID, a2.NodeID)
-
-				// No blocked evals
-				must.SliceEmpty(t, h.ReblockEvals)
-				must.Len(t, 1, h.CreateEvals)
-				must.Eq(t, h.CreateEvals[0].ID, e.ID)
-			} else {
-				// No new alloc was created
-				as, err := h.State.AllocsByJob(nil, job.Namespace, job.ID, false)
-				must.NoError(t, err)
+			// Register a new node, leave it up, process the followup eval
+			node = mock.Node()
+			must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+			must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(),
+				[]*structs.Evaluation{followupEval}))
+			must.NoError(t, h.Process(NewServiceScheduler, followupEval))
 
-				must.Len(t, 1, as)
-				old := as[0]
+			allocs, err = h.State.AllocsByJob(nil, job.Namespace, job.ID, false)
+			must.NoError(t, err)
+			must.Len(t, 2, allocs)
 
-				must.Eq(t, alloc.ID, old.ID)
-				must.Eq(t, structs.AllocClientStatusLost, old.ClientStatus)
-				must.Eq(t, structs.AllocDesiredStatusStop, old.DesiredStatus)
+			alloc2 := allocs[0]
+			if alloc2.ID == alloc.ID {
+				alloc2 = allocs[1]
 			}
+
+			must.Eq(t, structs.AllocClientStatusPending, alloc2.ClientStatus)
+			must.Eq(t, structs.AllocDesiredStatusRun, alloc2.DesiredStatus)
+			must.Eq(t, node.ID, alloc2.NodeID)
+
+			// No more follow-up evals
+			must.SliceEmpty(t, h.ReblockEvals)
+			must.Len(t, 1, h.CreateEvals)
+			must.Eq(t, h.CreateEvals[0].ID, followupEval.ID)
 		})
 	}
 }
@@ -5839,7 +5842,7 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) {
 
 	// Register a draining node
 	node := mock.DrainNode()
-	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+	must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
 
 	// Create an alloc on the draining node
 	alloc := mock.Alloc()
@@ -5848,8 +5851,8 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) {
 	alloc.Job.TaskGroups[0].Count = 1
 	alloc.Job.TaskGroups[0].EphemeralDisk.Sticky = true
 	alloc.DesiredTransition.Migrate = pointer.Of(true)
-	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, alloc.Job))
-	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
+	must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, alloc.Job))
+	must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
 
 	// Create a mock evaluation to deal with drain
 	eval := &structs.Evaluation{
@@ -5862,33 +5865,25 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) {
 		Status:      structs.EvalStatusPending,
 	}
 
-	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+	must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
 
 	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	must.NoError(t, h.Process(NewServiceScheduler, eval))
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	must.Len(t, 1, h.Plans, must.Sprint("expected plan"))
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted all allocs
-	if len(plan.NodeUpdate[node.ID]) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	must.Eq(t, 1, len(plan.NodeUpdate[node.ID]),
+		must.Sprint("expected alloc to be evicted"))
 
 	// Ensure the plan didn't create any new allocations
 	var planned []*structs.Allocation
 	for _, allocList := range plan.NodeAllocation {
 		planned = append(planned, allocList...)
 	}
-	if len(planned) != 0 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	must.Eq(t, 0, len(planned))
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
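
Reviewer note: the mechanical part of the `testify` to `shoenig/test` migration above is mostly an argument-order flip: `require.Len(t, collection, n)` becomes `must.Len(t, n, collection)`, the expected value stays first in `must.Eq`, and failure messages move into a trailing `must.Sprint(...)` setting. A minimal sketch of the pattern, not part of this patch (the package name, test name, and values below are made up for illustration):

    package scheduler_test

    import (
        "testing"

        "github.com/shoenig/test/must"
        "github.com/stretchr/testify/require"
    )

    func TestAssertionStyles(t *testing.T) {
        plans := []string{"plan-1"}

        // testify: collection first, then the expected length.
        require.Len(t, plans, 1)
        // shoenig/test: expected length first, then the collection, with an
        // optional failure message passed as a setting.
        must.Len(t, 1, plans, must.Sprint("expected one plan"))

        // Both libraries take the expected value before the actual value.
        require.Equal(t, "plan-1", plans[0])
        must.Eq(t, "plan-1", plans[0])
    }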