From aef9bfeb12b6fd79bcd474b64704b337e566b3b8 Mon Sep 17 00:00:00 2001
From: Mahmood Ali <mahmood@hashicorp.com>
Date: Wed, 18 Aug 2021 09:50:37 -0400
Subject: [PATCH] Consider all system jobs for a new node (#11054)

When a node becomes ready, create an eval for all system jobs across
namespaces.

The previous code uses `job.ID` to deduplicate evals, but that ignores
the job namespace. Thus if there are multiple jobs in different
namespaces sharing the same ID/Name, only one will be considered for
running in the new node. Thus, Nomad may skip running some system jobs
in that node.
---
 .changelog/11054.txt        |  3 ++
 nomad/node_endpoint.go      | 10 +++---
 nomad/node_endpoint_test.go | 61 +++++++++++++++++++++++++++++++++++++
 nomad/structs/structs.go    |  4 +--
 scheduler/rank.go           |  4 +--
 5 files changed, 73 insertions(+), 9 deletions(-)
 create mode 100644 .changelog/11054.txt

diff --git a/.changelog/11054.txt b/.changelog/11054.txt
new file mode 100644
index 00000000000..c949fc34239
--- /dev/null
+++ b/.changelog/11054.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+core: Fixed a bug where system jobs with non-unique IDs may not be placed on new nodes
+```
diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go
index 36a18a26f64..7775fcef29d 100644
--- a/nomad/node_endpoint.go
+++ b/nomad/node_endpoint.go
@@ -1352,15 +1352,15 @@ func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint6
 	// Create an eval for each JobID affected
 	var evals []*structs.Evaluation
 	var evalIDs []string
-	jobIDs := make(map[string]struct{})
+	jobIDs := map[structs.NamespacedID]struct{}{}
 	now := time.Now().UTC().UnixNano()
 
 	for _, alloc := range allocs {
 		// Deduplicate on JobID
-		if _, ok := jobIDs[alloc.JobID]; ok {
+		if _, ok := jobIDs[alloc.JobNamespacedID()]; ok {
 			continue
 		}
-		jobIDs[alloc.JobID] = struct{}{}
+		jobIDs[alloc.JobNamespacedID()] = struct{}{}
 
 		// Create a new eval
 		eval := &structs.Evaluation{
@@ -1383,10 +1383,10 @@ func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint6
 	// Create an evaluation for each system job.
 	for _, job := range sysJobs {
 		// Still dedup on JobID as the node may already have the system job.
-		if _, ok := jobIDs[job.ID]; ok {
+		if _, ok := jobIDs[job.NamespacedID()]; ok {
 			continue
 		}
-		jobIDs[job.ID] = struct{}{}
+		jobIDs[job.NamespacedID()] = struct{}{}
 
 		// Create a new eval
 		eval := &structs.Evaluation{
diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go
index fbc82d8b7a0..36969ddd2a0 100644
--- a/nomad/node_endpoint_test.go
+++ b/nomad/node_endpoint_test.go
@@ -2492,6 +2492,67 @@ func TestClientEndpoint_CreateNodeEvals(t *testing.T) {
 	}
 }
 
+// TestClientEndpoint_CreateNodeEvals_MultipleNSes asserts that evals are made
+// for all jobs across namespaces
+func TestClientEndpoint_CreateNodeEvals_MultipleNSes(t *testing.T) {
+	t.Parallel()
+
+	s1, cleanupS1 := TestServer(t, nil)
+	defer cleanupS1()
+	testutil.WaitForLeader(t, s1.RPC)
+
+	state := s1.fsm.State()
+
+	idx := uint64(3)
+	ns1 := mock.Namespace()
+	err := state.UpsertNamespaces(idx, []*structs.Namespace{ns1})
+	require.NoError(t, err)
+	idx++
+
+	node := mock.Node()
+	err = state.UpsertNode(structs.MsgTypeTestSetup, idx, node)
+	require.NoError(t, err)
+	idx++
+
+	// Inject a fake system job.
+	defaultJob := mock.SystemJob()
+	err = state.UpsertJob(structs.MsgTypeTestSetup, idx, defaultJob)
+	require.NoError(t, err)
+	idx++
+
+	nsJob := mock.SystemJob()
+	nsJob.ID = defaultJob.ID
+	nsJob.Namespace = ns1.Name
+	err = state.UpsertJob(structs.MsgTypeTestSetup, idx, nsJob)
+	require.NoError(t, err)
+	idx++
+
+	// Create some evaluations
+	evalIDs, index, err := s1.staticEndpoints.Node.createNodeEvals(node.ID, 1)
+	require.NoError(t, err)
+	require.NotZero(t, index)
+	require.Len(t, evalIDs, 2)
+
+	byNS := map[string]*structs.Evaluation{}
+	for _, evalID := range evalIDs {
+		eval, err := state.EvalByID(nil, evalID)
+		require.NoError(t, err)
+		byNS[eval.Namespace] = eval
+	}
+
+	require.Len(t, byNS, 2)
+
+	defaultNSEval := byNS[defaultJob.Namespace]
+	require.NotNil(t, defaultNSEval)
+	require.Equal(t, defaultJob.ID, defaultNSEval.JobID)
+	require.Equal(t, defaultJob.Namespace, defaultNSEval.Namespace)
+
+	otherNSEval := byNS[nsJob.Namespace]
+	require.NotNil(t, otherNSEval)
+	require.Equal(t, nsJob.ID, otherNSEval.JobID)
+	require.Equal(t, nsJob.Namespace, otherNSEval.Namespace)
+}
+
 func TestClientEndpoint_Evaluate(t *testing.T) {
 	t.Parallel()
 
diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index 3cff6ffc5f9..240c524e3ef 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -3986,8 +3986,8 @@ type Job struct {
 }
 
 // NamespacedID returns the namespaced id useful for logging
-func (j *Job) NamespacedID() *NamespacedID {
-	return &NamespacedID{
+func (j *Job) NamespacedID() NamespacedID {
+	return NamespacedID{
 		ID:        j.ID,
 		Namespace: j.Namespace,
 	}
diff --git a/scheduler/rank.go b/scheduler/rank.go
index efe586efaaa..799a29c4692 100644
--- a/scheduler/rank.go
+++ b/scheduler/rank.go
@@ -151,7 +151,7 @@ type BinPackIterator struct {
 	source    RankIterator
 	evict     bool
 	priority  int
-	jobId     *structs.NamespacedID
+	jobId     structs.NamespacedID
 	taskGroup *structs.TaskGroup
 	scoreFit  func(*structs.Node, *structs.ComparableResources) float64
 }
@@ -228,7 +228,7 @@ OUTER:
 		var allocsToPreempt []*structs.Allocation
 
 		// Initialize preemptor with node
-		preemptor := NewPreemptor(iter.priority, iter.ctx, iter.jobId)
+		preemptor := NewPreemptor(iter.priority, iter.ctx, &iter.jobId)
 		preemptor.SetNode(option.Node)
 
 		// Count the number of existing preemptions