From 264932dae474f418ee2e9300d9ea822e859f2dfc Mon Sep 17 00:00:00 2001 From: Drew Bailey <2614075+drewbailey@users.noreply.github.com> Date: Tue, 21 Jan 2020 14:42:39 -0500 Subject: [PATCH 1/2] Return FailedTGAlloc metric instead of no node err If an existing system allocation is running and the node it is running on is marked as ineligible, subsequent plan/apply calls return an RPC error instead of a more helpful plan result. This change logs the error and appends a failedTGAlloc for the placement. --- scheduler/system_sched.go | 9 ++++- scheduler/system_sched_test.go | 62 ++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index ec661efbb17..ed8cab317e1 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -275,7 +275,13 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { for _, missing := range place { node, ok := nodeByID[missing.Alloc.NodeID] if !ok { - return fmt.Errorf("could not find node %q", missing.Alloc.NodeID) + s.logger.Debug("could not find node %q", missing.Alloc.NodeID) + if s.failedTGAllocs == nil { + s.failedTGAllocs = make(map[string]*structs.AllocMetric) + } + + s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics() + continue } // Update the set of placement nodes @@ -327,6 +333,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { // Actual failure to start this task on this candidate node, report it individually s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics() s.addBlocked(node) + continue } diff --git a/scheduler/system_sched_test.go b/scheduler/system_sched_test.go index af0015730a2..6f4ceea609f 100644 --- a/scheduler/system_sched_test.go +++ b/scheduler/system_sched_test.go @@ -1310,6 +1310,68 @@ func TestSystemSched_Queued_With_Constraints(t *testing.T) { } +// No errors reported when no available nodes prevent placement +func TestSystemSched_NoNodes(t *testing.T) { + 
h := NewHarness(t) + + var node *structs.Node + // Create a node + node = mock.Node() + node.ComputeClass() + require.Nil(t, h.State.UpsertNode(h.NextIndex(), node)) + + // Make a job + job := mock.SystemJob() + require.Nil(t, h.State.UpsertJob(h.NextIndex(), job)) + + // Evaluate the job + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + + require.Nil(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + require.Nil(t, h.Process(NewSystemScheduler, eval)) + require.Equal(t, "complete", h.Evals[0].Status) + + // QueuedAllocations is drained + val, ok := h.Evals[0].QueuedAllocations["web"] + require.True(t, ok) + require.Equal(t, 0, val) + + // The plan has one NodeAllocations + require.Equal(t, 1, len(h.Plans)) + + // Mark the node as ineligible + node.SchedulingEligibility = structs.NodeSchedulingIneligible + + // Create a new job version, deploy + job2 := job.Copy() + job2.Meta["version"] = "2" + require.Nil(t, h.State.UpsertJob(h.NextIndex(), job2)) + + eval2 := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: job2.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job2.ID, + Status: structs.EvalStatusPending, + } + + // Ensure New eval is complete + require.Nil(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval2})) + require.Nil(t, h.Process(NewSystemScheduler, eval2)) + require.Equal(t, "complete", h.Evals[1].Status) + + // Ensure there is a FailedTGAlloc metric + require.Equal(t, 1, len(h.Evals[1].FailedTGAllocs)) +} + // No errors reported when constraints prevent placement func TestSystemSched_ConstraintErrors(t *testing.T) { h := NewHarness(t) From abde9f9e3d6a52c4b5119536bb2275c0c631eac7 Mon Sep 17 00:00:00 2001 From: Drew Bailey <2614075+drewbailey@users.noreply.github.com> Date: Wed, 22 Jan 
2020 10:10:15 -0500 Subject: [PATCH 2/2] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c3d0c3975a..38d94ca3d1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ BUG FIXES: * consul: Fixed a bug where script-based health checks would fail if the service configuration included interpolation. [[GH-6916](https://github.com/hashicorp/nomad/issues/6916)] * consul/connect: Fixed a bug where Connect-enabled jobs failed to validate when service names used interpolation. [[GH-6855](https://github.com/hashicorp/nomad/issues/6855)] * scheduler: Fixed a bug that caused evicted allocs on a lost node to be stuck in running. [[GH-6902](https://github.com/hashicorp/nomad/issues/6902)] + * scheduler: Fixed a bug where `nomad job plan/apply` returned errors instead of a partial placement warning for ineligible nodes. [[GH-6968](https://github.com/hashicorp/nomad/issues/6968)] ## 0.10.2 (December 4, 2019)