From 5bfb500932d4ac6e08cc71837b3740a264450a19 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Wed, 22 May 2024 10:23:08 -0400
Subject: [PATCH] refactor scheduler tests for node down/disconnected (#22198)

While working on #20462 #12319 I found that some of our scheduler tests
around down nodes or disconnected clients were enforcing invariants that
were unclear. This changeset pulls out some minor refactorings so that the
bug fix PR is easier to review. This includes:

* Migrating a few tests from `testify` to `shoenig/test` that I'm going to
  touch in #12319 anyways.
* Adding test names to the node down test
* Update the disconnected client test so that we always re-process the
  pending/blocked eval it creates; this eliminates 2 redundant sub-tests.
* Update the disconnected client test assertions so that they're explicit
  in the test setup rather than implied by whether we re-process the
  pending/blocked eval.

Ref: https://github.com/hashicorp/nomad/issues/20462
Ref: https://github.com/hashicorp/nomad/pull/12319
---
 scheduler/generic_sched_test.go | 221 ++++++++++++++++----------------
 1 file changed, 108 insertions(+), 113 deletions(-)

diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 5e0ec00fb80..9d5934979f0 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -18,6 +18,7 @@ import (
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/shoenig/test"
 	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -3524,6 +3525,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
 	ci.Parallel(t)
 
 	cases := []struct {
+		name       string
 		desired    string
 		client     string
 		migrate    bool
@@ -3532,36 +3534,43 @@ func TestServiceSched_NodeDown(t *testing.T) {
 		lost       bool
 	}{
 		{
+			name:    "should stop is running should be lost",
 			desired: structs.AllocDesiredStatusStop,
 			client:  structs.AllocClientStatusRunning,
 			lost:    true,
 		},
 		{
+			name:    "should run is pending should be migrate",
 			desired: structs.AllocDesiredStatusRun,
 			client:  structs.AllocClientStatusPending,
 			migrate: true,
 		},
 		{
+			name:    "should run is running should be migrate",
 			desired: structs.AllocDesiredStatusRun,
 			client:  structs.AllocClientStatusRunning,
 			migrate: true,
 		},
 		{
+			name:     "should run is lost should be terminal",
 			desired:  structs.AllocDesiredStatusRun,
 			client:   structs.AllocClientStatusLost,
 			terminal: true,
 		},
 		{
+			name:     "should run is complete should be terminal",
 			desired:  structs.AllocDesiredStatusRun,
 			client:   structs.AllocClientStatusComplete,
 			terminal: true,
 		},
 		{
+			name:       "should run is failed should be rescheduled",
 			desired:    structs.AllocDesiredStatusRun,
 			client:     structs.AllocClientStatusFailed,
 			reschedule: true,
 		},
 		{
+			name:    "should evict is running should be lost",
 			desired: structs.AllocDesiredStatusEvict,
 			client:  structs.AllocClientStatusRunning,
 			lost:    true,
@@ -3569,17 +3578,17 @@ func TestServiceSched_NodeDown(t *testing.T) {
 	}
 
 	for i, tc := range cases {
-		t.Run(fmt.Sprintf(""), func(t *testing.T) {
+		t.Run(tc.name, func(t *testing.T) {
 			h := NewHarness(t)
 
 			// Register a node
 			node := mock.Node()
 			node.Status = structs.NodeStatusDown
-			require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+			must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
 
 			// Generate a fake job with allocations and an update policy.
 			job := mock.Job()
-			require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job))
+			must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job))
 
 			alloc := mock.Alloc()
 			alloc.Job = job
@@ -3594,7 +3603,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
 			alloc.DesiredTransition.Migrate = pointer.Of(tc.migrate)
 
 			allocs := []*structs.Allocation{alloc}
-			require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
+			must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
 
 			// Create a mock evaluation to deal with drain
 			eval := &structs.Evaluation{
@@ -3606,31 +3615,30 @@ func TestServiceSched_NodeDown(t *testing.T) {
 				NodeID:      node.ID,
 				Status:      structs.EvalStatusPending,
 			}
-			require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+			must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
 
 			// Process the evaluation
 			err := h.Process(NewServiceScheduler, eval)
-			require.NoError(t, err)
+			must.NoError(t, err)
 
 			if tc.terminal {
-				// No plan for terminal state allocs
-				require.Len(t, h.Plans, 0)
+				must.Len(t, 0, h.Plans, must.Sprint("expected no plan"))
 			} else {
-				require.Len(t, h.Plans, 1)
+				must.Len(t, 1, h.Plans, must.Sprint("expected plan"))
 
 				plan := h.Plans[0]
 				out := plan.NodeUpdate[node.ID]
-				require.Len(t, out, 1)
+				must.Len(t, 1, out)
 
 				outAlloc := out[0]
 				if tc.migrate {
-					require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+					must.NotEq(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
 				} else if tc.reschedule {
-					require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+					must.Eq(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
 				} else if tc.lost {
-					require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+					must.Eq(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
 				} else {
-					require.Fail(t, "unexpected alloc update")
+					t.Fatal("unexpected alloc update")
 				}
 			}
 
@@ -3643,37 +3651,33 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 	ci.Parallel(t)
 
 	cases := []struct {
-		name        string
-		jobSpecFn   func(*structs.Job)
-		when        time.Time
-		rescheduled bool
+		name                string
+		jobSpecFn           func(*structs.Job)
+		previousStopWhen    time.Time
+		expectBlockedEval   bool
+		expectUpdate        bool
+		expectedAllocStates int
 	}{
 		// Test using stop_after_client_disconnect, remove after its deprecated in favor
 		// of Disconnect.StopOnClientAfter introduced in 1.8.0.
 		{
-			name: "legacy no stop_after_client_disconnect with reschedule",
+			name: "legacy no stop_after_client_disconnect",
 			jobSpecFn: func(job *structs.Job) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].StopAfterClientDisconnect = nil
 			},
-			rescheduled: true,
-		},
-		{
-			name: "legacy stop_after_client_disconnect without reschedule",
-			jobSpecFn: func(job *structs.Job) {
-				job.TaskGroups[0].Count = 1
-				job.TaskGroups[0].StopAfterClientDisconnect = pointer.Of(1 * time.Second)
-			},
-			rescheduled: false,
+			expectBlockedEval:   true,
+			expectedAllocStates: 1,
 		},
 		{
-			name: "legacy stop_after_client_disconnect with reschedule",
+			name: "legacy stop_after_client_disconnect reschedule now",
			jobSpecFn: func(job *structs.Job) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].StopAfterClientDisconnect = pointer.Of(1 * time.Second)
 			},
-			when:        time.Now().UTC().Add(-10 * time.Second),
-			rescheduled: true,
+			previousStopWhen:    time.Now().UTC().Add(-10 * time.Second),
+			expectBlockedEval:   true,
+			expectedAllocStates: 2,
 		},
 		{
 			name: "legacy stop_after_client_disconnect reschedule later",
@@ -3681,40 +3685,33 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].StopAfterClientDisconnect = pointer.Of(1 * time.Second)
 			},
-			when:        time.Now().UTC().Add(10 * time.Minute),
-			rescheduled: false,
+			expectBlockedEval:   false,
+			expectUpdate:        true,
+			expectedAllocStates: 1,
 		},
 		// Tests using the new disconnect block
 		{
-			name: "no StopOnClientAfter with reschedule",
+			name: "no StopOnClientAfter reschedule now",
 			jobSpecFn: func(job *structs.Job) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{
 					StopOnClientAfter: nil,
 				}
 			},
-			rescheduled: true,
+			expectBlockedEval:   true,
+			expectedAllocStates: 1,
 		},
 		{
-			name: "StopOnClientAfter without reschedule",
+			name: "StopOnClientAfter reschedule now",
 			jobSpecFn: func(job *structs.Job) {
 				job.TaskGroups[0].Count = 1
 				job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{
 					StopOnClientAfter: pointer.Of(1 * time.Second),
 				}
 			},
-			rescheduled: false,
-		},
-		{
-			name: "StopOnClientAfter with reschedule",
-			jobSpecFn: func(job *structs.Job) {
-				job.TaskGroups[0].Count = 1
-				job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{
-					StopOnClientAfter: pointer.Of(1 * time.Second),
-				}
-			},
-			when:        time.Now().UTC().Add(-10 * time.Second),
-			rescheduled: true,
+			previousStopWhen:    time.Now().UTC().Add(-10 * time.Second),
+			expectBlockedEval:   true,
+			expectedAllocStates: 2,
 		},
 		{
 			name: "StopOnClientAfter reschedule later",
@@ -3724,8 +3721,9 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 					StopOnClientAfter: pointer.Of(1 * time.Second),
 				}
 			},
-			when:        time.Now().UTC().Add(10 * time.Minute),
-			rescheduled: false,
+			expectBlockedEval:   false,
+			expectUpdate:        true,
+			expectedAllocStates: 1,
 		},
 	}
 
@@ -3751,22 +3749,22 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 			alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
 			alloc.DesiredStatus = structs.AllocDesiredStatusRun
 			alloc.ClientStatus = structs.AllocClientStatusRunning
-			if !tc.when.IsZero() {
+			if !tc.previousStopWhen.IsZero() {
 				alloc.AllocStates = []*structs.AllocState{{
 					Field: structs.AllocStateFieldClientStatus,
 					Value: structs.AllocClientStatusLost,
-					Time:  tc.when,
+					Time:  tc.previousStopWhen,
 				}}
 			}
-			allocs := []*structs.Allocation{alloc}
-			must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
+			must.NoError(t, h.State.UpsertAllocs(
+				structs.MsgTypeTestSetup, h.NextIndex(),
+				[]*structs.Allocation{alloc}))
 
-			// Create a mock evaluation to deal with drain
+			// Create a mock evaluation to deal with node going down
 			evals := []*structs.Evaluation{{
 				Namespace:   structs.DefaultNamespace,
 				ID:          uuid.Generate(),
 				Priority:    50,
-				TriggeredBy: structs.EvalTriggerNodeDrain,
+				TriggeredBy: structs.EvalTriggerNodeUpdate,
 				JobID:       job.ID,
 				NodeID:      node.ID,
 				Status:      structs.EvalStatusPending,
@@ -3782,60 +3780,65 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
 
 			// One followup eval created, either delayed or blocked
 			must.Len(t, 1, h.CreateEvals)
-			e := h.CreateEvals[0]
-			must.Eq(t, eval.ID, e.PreviousEval)
-
-			if tc.rescheduled {
-				must.Eq(t, "blocked", e.Status)
-			} else {
-				must.Eq(t, "pending", e.Status)
-				must.NotEq(t, time.Time{}, e.WaitUntil)
-			}
+			followupEval := h.CreateEvals[0]
+			must.Eq(t, eval.ID, followupEval.PreviousEval)
 
-			alloc, err = h.State.AllocByID(nil, alloc.ID)
+			// Either way, no new alloc was created
+			allocs, err := h.State.AllocsByJob(nil, job.Namespace, job.ID, false)
 			must.NoError(t, err)
+			must.Len(t, 1, allocs)
+			must.Eq(t, alloc.ID, allocs[0].ID)
+			alloc = allocs[0]
 
 			// Allocations have been transitioned to lost
 			must.Eq(t, structs.AllocDesiredStatusStop, alloc.DesiredStatus)
 			must.Eq(t, structs.AllocClientStatusLost, alloc.ClientStatus)
 
-			// At least 1, 2 if we manually set the tc.when
-			must.SliceNotEmpty(t, alloc.AllocStates)
-			if tc.rescheduled {
-				// Register a new node, leave it up, process the followup eval
-				node = mock.Node()
-				must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
-				must.NoError(t, h.Process(NewServiceScheduler, eval))
+			// 1 if rescheduled, 2 for rescheduled later
+			test.Len(t, tc.expectedAllocStates, alloc.AllocStates)
 
-				as, err := h.State.AllocsByJob(nil, job.Namespace, job.ID, false)
-				must.NoError(t, err)
-				must.Len(t, 2, as)
+			if tc.expectBlockedEval {
+				must.Eq(t, structs.EvalStatusBlocked, followupEval.Status)
 
-				a2 := as[0]
-				if a2.ID == alloc.ID {
-					a2 = as[1]
+			} else {
+				must.Eq(t, structs.EvalStatusPending, followupEval.Status)
+				must.NotEq(t, time.Time{}, followupEval.WaitUntil)
+
+				if tc.expectUpdate {
+					must.Len(t, 1, h.Plans[0].NodeUpdate[node.ID])
+					must.Eq(t, structs.AllocClientStatusLost,
+						h.Plans[0].NodeUpdate[node.ID][0].ClientStatus)
+					must.MapLen(t, 0, h.Plans[0].NodeAllocation)
+				} else {
+					must.Len(t, 0, h.Plans[0].NodeUpdate[node.ID])
+					must.MapLen(t, 1, h.Plans[0].NodeAllocation)
 				}
+			}
 
-				must.Eq(t, structs.AllocClientStatusPending, a2.ClientStatus)
-				must.Eq(t, structs.AllocDesiredStatusRun, a2.DesiredStatus)
-				must.Eq(t, node.ID, a2.NodeID)
-
-				// No blocked evals
-				must.SliceEmpty(t, h.ReblockEvals)
-				must.Len(t, 1, h.CreateEvals)
-				must.Eq(t, h.CreateEvals[0].ID, e.ID)
-			} else {
-				// No new alloc was created
-				as, err := h.State.AllocsByJob(nil, job.Namespace, job.ID, false)
-				must.NoError(t, err)
+			// Register a new node, leave it up, process the followup eval
+			node = mock.Node()
+			must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+			must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(),
+				[]*structs.Evaluation{followupEval}))
+			must.NoError(t, h.Process(NewServiceScheduler, followupEval))
 
-				must.Len(t, 1, as)
-				old := as[0]
+			allocs, err = h.State.AllocsByJob(nil, job.Namespace, job.ID, false)
+			must.NoError(t, err)
+			must.Len(t, 2, allocs)
 
-				must.Eq(t, alloc.ID, old.ID)
-				must.Eq(t, structs.AllocClientStatusLost, old.ClientStatus)
-				must.Eq(t, structs.AllocDesiredStatusStop, old.DesiredStatus)
+			alloc2 := allocs[0]
+			if alloc2.ID == alloc.ID {
+				alloc2 = allocs[1]
 			}
+
+			must.Eq(t, structs.AllocClientStatusPending, alloc2.ClientStatus)
+			must.Eq(t, structs.AllocDesiredStatusRun, alloc2.DesiredStatus)
+			must.Eq(t, node.ID, alloc2.NodeID)
+
+			// No more follow-up evals
+			must.SliceEmpty(t, h.ReblockEvals)
+			must.Len(t, 1, h.CreateEvals)
+			must.Eq(t, h.CreateEvals[0].ID, followupEval.ID)
 		})
 	}
 }
@@ -5839,7 +5842,7 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) {
 
 	// Register a draining node
 	node := mock.DrainNode()
-	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+	must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
 
 	// Create an alloc on the draining node
 	alloc := mock.Alloc()
@@ -5848,8 +5851,8 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) {
 	alloc.Job.TaskGroups[0].Count = 1
 	alloc.Job.TaskGroups[0].EphemeralDisk.Sticky = true
 	alloc.DesiredTransition.Migrate = pointer.Of(true)
-	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, alloc.Job))
-	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
+	must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, alloc.Job))
+	must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
 
 	// Create a mock evaluation to deal with drain
 	eval := &structs.Evaluation{
@@ -5862,33 +5865,25 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) {
 		Status:      structs.EvalStatusPending,
 	}
 
-	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+	must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
 
 	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	must.NoError(t, h.Process(NewServiceScheduler, eval))
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	must.Len(t, 1, h.Plans, must.Sprint("expected plan"))
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted all allocs
-	if len(plan.NodeUpdate[node.ID]) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	must.Eq(t, 1, len(plan.NodeUpdate[node.ID]),
+		must.Sprint("expected alloc to be evicted"))
 
 	// Ensure the plan didn't create any new allocations
 	var planned []*structs.Allocation
 	for _, allocList := range plan.NodeAllocation {
 		planned = append(planned, allocList...)
 	}
-	if len(planned) != 0 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	must.Eq(t, 0, len(planned))
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
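
Reviewer note: the mechanical part of the `testify` to `shoenig/test` migration above is mostly an argument-order flip: `require.Len(t, collection, n)` becomes `must.Len(t, n, collection)`, the expected value stays first in `must.Eq`, and failure messages move into a trailing `must.Sprint(...)` setting. A minimal sketch of the pattern, not part of this patch (the package name, test name, and values below are made up for illustration):

    package scheduler_test

    import (
        "testing"

        "github.com/shoenig/test/must"
        "github.com/stretchr/testify/require"
    )

    func TestAssertionStyles(t *testing.T) {
        plans := []string{"plan-1"}

        // testify: collection first, then the expected length.
        require.Len(t, plans, 1)
        // shoenig/test: expected length first, then the collection, with an
        // optional failure message passed as a setting.
        must.Len(t, 1, plans, must.Sprint("expected one plan"))

        // Both libraries take the expected value before the actual value.
        require.Equal(t, "plan-1", plans[0])
        must.Eq(t, "plan-1", plans[0])
    }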