From 41b853b44d88dcacbd9717b570f902e46445149e Mon Sep 17 00:00:00 2001
From: Mahmood Ali
Date: Tue, 31 Aug 2021 16:51:30 -0400
Subject: [PATCH 1/6] scheduler: warn when system jobs cannot place an alloc

When a system or sysbatch job specifies constraints that none of the
current nodes meet, report a warning to the user. Also, for sysbatch
jobs, mark the job as dead as a result.

A sample run would look like:

```
$ nomad job run ./example.nomad
==> 2021-08-31T16:57:35-04:00: Monitoring evaluation "b48e8882"
    2021-08-31T16:57:35-04:00: Evaluation triggered by job "example"
==> 2021-08-31T16:57:36-04:00: Monitoring evaluation "b48e8882"
    2021-08-31T16:57:36-04:00: Evaluation status changed: "pending" -> "complete"
==> 2021-08-31T16:57:36-04:00: Evaluation "b48e8882" finished with status "complete" but failed to place all allocations:
    2021-08-31T16:57:36-04:00: Task Group "cache" (failed to place 1 allocation):
      * Constraint "${meta.tag} = bar": 2 nodes excluded by filter
      * Constraint "${attr.kernel.name} = linux": 1 nodes excluded by filter

$ nomad job status example
ID            = example
Name          = example
Submit Date   = 2021-08-31T16:57:35-04:00
Type          = sysbatch
Priority      = 50
Datacenters   = dc1
Namespace     = default
Status        = dead
Periodic      = false
Parameterized = false

Summary
Task Group  Queued  Starting  Running  Failed  Complete  Lost
cache       0       0         0        0       0         0

Allocations
No allocations placed
```
---
 scheduler/scheduler_sysbatch_test.go | 67 +++++++++++++++++++++++++---
 scheduler/scheduler_system.go        | 47 ++++++++++++++++---
 2 files changed, 102 insertions(+), 12 deletions(-)

diff --git a/scheduler/scheduler_sysbatch_test.go b/scheduler/scheduler_sysbatch_test.go
index a8feeb8d2c3..aba43df9858 100644
--- a/scheduler/scheduler_sysbatch_test.go
+++ b/scheduler/scheduler_sysbatch_test.go
@@ -10,6 +10,7 @@ import (
     "github.com/hashicorp/nomad/helper/uuid"
     "github.com/hashicorp/nomad/nomad/mock"
     "github.com/hashicorp/nomad/nomad/structs"
+    "github.com/kr/pretty"
     "github.com/stretchr/testify/require"
 )
 
@@ -758,13 +759,17 @@ func TestSysBatch_RetryLimit(t *testing.T) {
 func TestSysBatch_Queued_With_Constraints(t *testing.T) {
     h := NewHarness(t)
 
-    // Register a node
-    node := mock.Node()
-    node.Attributes["kernel.name"] = "darwin"
-    require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+    nodes := createNodes(t, h, 3)
 
     // Generate a sysbatch job which can't be placed on the node
     job := mock.SystemBatchJob()
+    job.Constraints = []*structs.Constraint{
+        {
+            LTarget: "${attr.kernel.name}",
+            RTarget: "not_existing_os",
+            Operand: "=",
+        },
+    }
     require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
 
     // Create a mock evaluation to deal with the node update
@@ -772,9 +777,8 @@ func TestSysBatch_Queued_With_Constraints(t *testing.T) {
         Namespace:   structs.DefaultNamespace,
         ID:          uuid.Generate(),
         Priority:    50,
-        TriggeredBy: structs.EvalTriggerNodeUpdate,
+        TriggeredBy: structs.EvalTriggerJobRegister,
         JobID:       job.ID,
-        NodeID:      node.ID,
         Status:      structs.EvalStatusPending,
     }
     require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
@@ -787,6 +791,57 @@ func TestSysBatch_Queued_With_Constraints(t *testing.T) {
     val, ok := h.Evals[0].QueuedAllocations["pinger"]
     require.True(t, ok)
     require.Zero(t, val)
+
+    failedTGAllocs := h.Evals[0].FailedTGAllocs
+    pretty.Println(failedTGAllocs)
+    require.NotNil(t, failedTGAllocs)
+    require.Contains(t, failedTGAllocs, "pinger")
+    require.Equal(t, len(nodes), failedTGAllocs["pinger"].NodesEvaluated)
+    require.Equal(t, len(nodes), failedTGAllocs["pinger"].NodesFiltered)
+
+}
+
+func TestSysBatch_Queued_With_Constraints_PartialMatch(t *testing.T) {
+    h := NewHarness(t)
+
+    // linux machines
+    linux := createNodes(t, h, 3)
+    for i := 0; i < 3; i++ {
+        node := mock.Node()
+        node.Attributes["kernel.name"] = "darwin"
+        node.ComputeClass()
+        require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+    }
+
+    // Generate a sysbatch job which can't be placed on the node
+    job := mock.SystemBatchJob()
+    require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+    // Create a mock evaluation to deal with the node update
+    eval := &structs.Evaluation{
+        Namespace:   structs.DefaultNamespace,
+        ID:          uuid.Generate(),
+        Priority:    50,
+        TriggeredBy: structs.EvalTriggerJobRegister,
+        JobID:       job.ID,
+        Status:      structs.EvalStatusPending,
+    }
+    require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+    // Process the evaluation
+    err := h.Process(NewSysBatchScheduler, eval)
+    require.NoError(t, err)
+
+    foundNodes := map[string]bool{}
+    for n := range h.Plans[0].NodeAllocation {
+        foundNodes[n] = true
+    }
+    expected := map[string]bool{}
+    for _, n := range linux {
+        expected[n.ID] = true
+    }
+
+    require.Equal(t, expected, foundNodes)
 }
 
 // This test ensures that the scheduler correctly ignores ineligible
diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go
index 304e86a8bd4..296c75f4379 100644
--- a/scheduler/scheduler_system.go
+++ b/scheduler/scheduler_system.go
@@ -279,6 +279,24 @@ func (s *SystemScheduler) computeJobAllocs() error {
     return s.computePlacements(diff.place)
 }
 
+func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric {
+    if acc == nil {
+        v := *curr
+        return &v
+    }
+
+    acc.NodesEvaluated += curr.NodesEvaluated
+    acc.NodesFiltered += curr.NodesFiltered
+    for k, v := range curr.ClassFiltered {
+        acc.ClassFiltered[k] += v
+    }
+    for k, v := range curr.ConstraintFiltered {
+        acc.ConstraintFiltered[k] += v
+    }
+    acc.AllocationTime += curr.AllocationTime
+    return acc
+}
+
 // computePlacements computes placements for allocations
 func (s *SystemScheduler) computePlacements(place []allocTuple) error {
     nodeByID := make(map[string]*structs.Node, len(s.nodes))
@@ -286,8 +304,13 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
         nodeByID[node.ID] = node
     }
 
+    // track node filtering, to only report an error if all nodes have been filtered
+    var filteredMetrics map[string]*structs.AllocMetric
+
     nodes := make([]*structs.Node, 1)
     for _, missing := range place {
+        tgName := missing.TaskGroup.Name
+
         node, ok := nodeByID[missing.Alloc.NodeID]
         if !ok {
             s.logger.Debug("could not find node %q", missing.Alloc.NodeID)
@@ -309,13 +332,25 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
             // couldn't create an allocation then decrementing queued for that
             // task group
             if s.ctx.metrics.NodesFiltered > 0 {
-                s.queuedAllocs[missing.TaskGroup.Name] -= 1
+                queued := s.queuedAllocs[tgName] - 1
+                s.queuedAllocs[tgName] = queued
+                if filteredMetrics == nil {
+                    filteredMetrics = map[string]*structs.AllocMetric{}
+                }
+                filteredMetrics[tgName] = mergeNodeFiltered(filteredMetrics[tgName], s.ctx.Metrics())
+
+                if queued <= 0 {
+                    if s.failedTGAllocs == nil {
+                        s.failedTGAllocs = make(map[string]*structs.AllocMetric)
+                    }
+                    s.failedTGAllocs[tgName] = filteredMetrics[tgName]
+                }
 
                 // If we are annotating the plan, then decrement the desired
                 // placements based on whether the node meets the constraints
                 if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
                     s.plan.Annotations.DesiredTGUpdates != nil {
-                    desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
+                    desired := s.plan.Annotations.DesiredTGUpdates[tgName]
                     desired.Place -= 1
                 }
 
@@ -324,7 +359,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
         }
 
         // Check if this task group has already failed, reported to the user as a count
-        if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
+        if metric, ok := s.failedTGAllocs[tgName]; ok {
             metric.CoalescedFailures += 1
             metric.ExhaustResources(missing.TaskGroup)
             continue
@@ -345,7 +380,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
             s.ctx.Metrics().ExhaustResources(missing.TaskGroup)
 
             // Actual failure to start this task on this candidate node, report it individually
-            s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
+            s.failedTGAllocs[tgName] = s.ctx.Metrics()
             s.addBlocked(node)
 
             continue
@@ -378,7 +413,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
             EvalID:    s.eval.ID,
             Name:      missing.Name,
             JobID:     s.job.ID,
-            TaskGroup: missing.TaskGroup.Name,
+            TaskGroup: tgName,
             Metrics:   s.ctx.Metrics(),
             NodeID:    option.Node.ID,
             NodeName:  option.Node.Name,
@@ -410,7 +445,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
         if s.eval.AnnotatePlan && s.plan.Annotations != nil {
             s.plan.Annotations.PreemptedAllocs = append(s.plan.Annotations.PreemptedAllocs, stop.Stub(nil))
             if s.plan.Annotations.DesiredTGUpdates != nil {
-                desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
+                desired := s.plan.Annotations.DesiredTGUpdates[tgName]
                 desired.Preemptions += 1
             }
         }

From 044ce3c99434de0c603238824905de28c9085602 Mon Sep 17 00:00:00 2001
From: Luiz Aoqui
Date: Wed, 1 Sep 2021 19:49:04 -0400
Subject: [PATCH 2/6] tests: update expected test result based on changes done in #11111

---
 scheduler/scheduler_sysbatch_test.go | 8 +++++---
 scheduler/scheduler_system_test.go   | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/scheduler/scheduler_sysbatch_test.go b/scheduler/scheduler_sysbatch_test.go
index aba43df9858..e7c23b36a34 100644
--- a/scheduler/scheduler_sysbatch_test.go
+++ b/scheduler/scheduler_sysbatch_test.go
@@ -945,8 +945,8 @@ func TestSysBatch_JobConstraint_AddNode(t *testing.T) {
     // Add a new node Class-B
     var nodeBTwo *structs.Node
     nodeBTwo = mock.Node()
-    require.NoError(t, nodeBTwo.ComputeClass())
     nodeBTwo.NodeClass = "Class-B"
+    require.NoError(t, nodeBTwo.ComputeClass())
     require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeBTwo))
 
     // Evaluate the new node
@@ -965,8 +965,10 @@ func TestSysBatch_JobConstraint_AddNode(t *testing.T) {
     require.Nil(t, h.Process(NewSysBatchScheduler, eval3))
     require.Equal(t, "complete", h.Evals[2].Status)
 
-    // Ensure no failed TG allocs
-    require.Equal(t, 0, len(h.Evals[2].FailedTGAllocs))
+    // Ensure `groupA` fails to be placed due to its constraint, but `groupB` doesn't
+    require.Equal(t, 1, len(h.Evals[2].FailedTGAllocs))
+    require.Contains(t, h.Evals[2].FailedTGAllocs, "groupA")
+    require.NotContains(t, h.Evals[2].FailedTGAllocs, "groupB")
 
     require.Len(t, h.Plans, 2)
     require.Len(t, h.Plans[1].NodeAllocation, 1)
diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go
index 03329139a98..c5c390f02e3 100644
--- a/scheduler/scheduler_system_test.go
+++ b/scheduler/scheduler_system_test.go
@@ -1275,8 +1275,8 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) {
     // Add a new node Class-B
     var nodeBTwo *structs.Node
     nodeBTwo = mock.Node()
-    require.NoError(t, nodeBTwo.ComputeClass())
     nodeBTwo.NodeClass = "Class-B"
+    require.NoError(t, nodeBTwo.ComputeClass())
     require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeBTwo))
 
     // Evaluate the new node
@@ -1295,8 +1295,10 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) {
     require.Nil(t, h.Process(NewSystemScheduler, eval3))
     require.Equal(t, "complete", h.Evals[2].Status)
 
-    // Ensure no failed TG allocs
-    require.Equal(t, 0, len(h.Evals[2].FailedTGAllocs))
+    // Ensure `groupA` fails to be placed due to its constraint, but `groupB` doesn't
+    require.Equal(t, 1, len(h.Evals[2].FailedTGAllocs))
+    require.Contains(t, h.Evals[2].FailedTGAllocs, "groupA")
+    require.NotContains(t, h.Evals[2].FailedTGAllocs, "groupB")
 
     require.Len(t, h.Plans, 2)
     require.Len(t, h.Plans[1].NodeAllocation, 1)

From 092abb8cd450fb9865b0166ace72ca4e70b22fd4 Mon Sep 17 00:00:00 2001
From: Luiz Aoqui
Date: Thu, 2 Sep 2021 11:36:02 -0400
Subject: [PATCH 3/6] test: use Len instead of Equal on system and sysbatch node constraint tests

---
 scheduler/scheduler_sysbatch_test.go | 4 ++--
 scheduler/scheduler_system_test.go   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scheduler/scheduler_sysbatch_test.go b/scheduler/scheduler_sysbatch_test.go
index e7c23b36a34..302c5c0bd79 100644
--- a/scheduler/scheduler_sysbatch_test.go
+++ b/scheduler/scheduler_sysbatch_test.go
@@ -934,7 +934,7 @@ func TestSysBatch_JobConstraint_AddNode(t *testing.T) {
     require.Equal(t, "complete", h.Evals[1].Status)
 
     // Ensure no new plans
-    require.Equal(t, 1, len(h.Plans))
+    require.Len(t, h.Plans, 1)
 
     // Ensure all NodeAllocations are from first Eval
     for _, allocs := range h.Plans[0].NodeAllocation {
@@ -966,7 +966,7 @@ func TestSysBatch_JobConstraint_AddNode(t *testing.T) {
     require.Equal(t, "complete", h.Evals[2].Status)
 
     // Ensure `groupA` fails to be placed due to its constraint, but `groupB` doesn't
-    require.Equal(t, 1, len(h.Evals[2].FailedTGAllocs))
+    require.Len(t, h.Evals[2].FailedTGAllocs, 1)
     require.Contains(t, h.Evals[2].FailedTGAllocs, "groupA")
     require.NotContains(t, h.Evals[2].FailedTGAllocs, "groupB")
 
diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go
index c5c390f02e3..b46bbc9e444 100644
--- a/scheduler/scheduler_system_test.go
+++ b/scheduler/scheduler_system_test.go
@@ -1264,7 +1264,7 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) {
     require.Equal(t, "complete", h.Evals[1].Status)
 
     // Ensure no new plans
-    require.Equal(t, 1, len(h.Plans))
+    require.Len(t, h.Plans, 1)
 
     // Ensure all NodeAllocations are from first Eval
     for _, allocs := range h.Plans[0].NodeAllocation {
@@ -1296,7 +1296,7 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) {
     require.Equal(t, "complete", h.Evals[2].Status)
 
     // Ensure `groupA` fails to be placed due to its constraint, but `groupB` doesn't
-    require.Equal(t, 1, len(h.Evals[2].FailedTGAllocs))
+    require.Len(t, h.Evals[2].FailedTGAllocs, 1)
     require.Contains(t, h.Evals[2].FailedTGAllocs, "groupA")
     require.NotContains(t, h.Evals[2].FailedTGAllocs, "groupB")
 

From dda8cc75b911d85a199ba46608eaf57a9a346849 Mon Sep 17 00:00:00 2001
From: Luiz Aoqui
Date: Thu, 2 Sep 2021 12:13:42 -0400
Subject: [PATCH 4/6] changelog: add entry for #11111

---
 .changelog/11111.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .changelog/11111.txt
diff --git a/.changelog/11111.txt b/.changelog/11111.txt
new file mode 100644
index 00000000000..5b08ae1c634
--- /dev/null
+++ b/.changelog/11111.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+scheduler: Keep track of system and sysbatch evaluations that fail to place an allocation
+```

From 334a4913c8db8d332358d77ba9d645d5e5fc2a8e Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Fri, 10 Sep 2021 16:41:31 -0700
Subject: [PATCH 5/6] scheduler: deep copy AllocMetric

Defensively deep copy AllocMetric to avoid side effects from shared map
references.
---
 scheduler/scheduler_system.go | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go
index 296c75f4379..2b0da38bdef 100644
--- a/scheduler/scheduler_system.go
+++ b/scheduler/scheduler_system.go
@@ -281,8 +281,7 @@ func (s *SystemScheduler) computeJobAllocs() error {
 
 func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric {
     if acc == nil {
-        v := *curr
-        return &v
+        return curr.Copy()
     }
 
     acc.NodesEvaluated += curr.NodesEvaluated
@@ -328,12 +327,13 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
 
             // If the task can't be placed on this node, update reporting data
             // and continue to short circuit the loop
-            // If this node was filtered because of constraint mismatches and we
-            // couldn't create an allocation then decrementing queued for that
-            // task group
+            // If this node was filtered because of constraint
+            // mismatches and we couldn't create an allocation then
+            // decrement queuedAllocs for that task group.
             if s.ctx.metrics.NodesFiltered > 0 {
                 queued := s.queuedAllocs[tgName] - 1
                 s.queuedAllocs[tgName] = queued
+
                 if filteredMetrics == nil {
                     filteredMetrics = map[string]*structs.AllocMetric{}
                 }

From 2a142daa08867fc7947b0018efd36f00462cef95 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Fri, 10 Sep 2021 16:45:43 -0700
Subject: [PATCH 6/6] docs: focus changelog entry for #11111 on the UX

While I don't think this fully encompasses the changes, other bits, like
marking sysbatch jobs as dead immediately, are new and so haven't changed
from a previous release.
---
 .changelog/11111.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.changelog/11111.txt b/.changelog/11111.txt
index 5b08ae1c634..4149e060854 100644
--- a/.changelog/11111.txt
+++ b/.changelog/11111.txt
@@ -1,3 +1,3 @@
 ```release-note:improvement
-scheduler: Keep track of system and sysbatch evaluations that fail to place an allocation
+scheduler: Warn users when system and sysbatch evaluations fail to place an allocation
 ```
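The patches above interleave the metric-accumulation logic with the rest of the scheduler. As a reading aid only, and not part of the patch series, here is a minimal, self-contained Go sketch of the same pattern: per-node filter metrics for a single task group are merged into one accumulator, deep-copying on first use as patch 5 does, so that a single merged metric can be reported once every node has been filtered. The `allocMetric` type below is a simplified stand-in for illustration, not the real `structs.AllocMetric`.

```go
package main

import "fmt"

// allocMetric is a simplified stand-in for structs.AllocMetric, carrying only
// the fields needed to illustrate the accumulation pattern.
type allocMetric struct {
	NodesEvaluated     int
	NodesFiltered      int
	ConstraintFiltered map[string]int
}

// deepCopy returns a copy with its own map, so later merges cannot mutate the
// source metric's map (the problem patch 5 guards against).
func (m *allocMetric) deepCopy() *allocMetric {
	c := &allocMetric{
		NodesEvaluated:     m.NodesEvaluated,
		NodesFiltered:      m.NodesFiltered,
		ConstraintFiltered: make(map[string]int, len(m.ConstraintFiltered)),
	}
	for k, v := range m.ConstraintFiltered {
		c.ConstraintFiltered[k] = v
	}
	return c
}

// merge mirrors the shape of mergeNodeFiltered: deep copy on first use, then
// accumulate per-constraint filter counts across nodes for one task group.
func merge(acc, curr *allocMetric) *allocMetric {
	if acc == nil {
		return curr.deepCopy()
	}
	acc.NodesEvaluated += curr.NodesEvaluated
	acc.NodesFiltered += curr.NodesFiltered
	for k, v := range curr.ConstraintFiltered {
		acc.ConstraintFiltered[k] += v
	}
	return acc
}

func main() {
	// Hypothetical per-node metrics for a task group that two nodes filtered
	// out for different constraints.
	perNode := []*allocMetric{
		{NodesEvaluated: 1, NodesFiltered: 1, ConstraintFiltered: map[string]int{"${meta.tag} = bar": 1}},
		{NodesEvaluated: 1, NodesFiltered: 1, ConstraintFiltered: map[string]int{"${attr.kernel.name} = linux": 1}},
	}

	var acc *allocMetric
	for _, m := range perNode {
		acc = merge(acc, m)
	}

	// If every node was filtered, the accumulated metric is what would be
	// surfaced to the user as the failed-placement report for that group.
	fmt.Printf("evaluated=%d filtered=%d constraints=%v\n",
		acc.NodesEvaluated, acc.NodesFiltered, acc.ConstraintFiltered)
}
```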