Skip to content

Commit

Permalink
Return FailedTGAlloc metric instead of no node err
Browse files Browse the repository at this point in the history
If an existing system allocation is running and the node it's running on
is marked as ineligible, subsequent plan/apply calls return an RPC error
instead of a more helpful plan result.

This change logs the error, and appends a failedTGAlloc for the
placement.
  • Loading branch information
drewbailey committed Jan 21, 2020
1 parent 6c3a29a commit 19f8302
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 1 deletion.
9 changes: 8 additions & 1 deletion scheduler/system_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,13 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
for _, missing := range place {
node, ok := nodeByID[missing.Alloc.NodeID]
if !ok {
return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
s.logger.Debug("could not find node %q", missing.Alloc.NodeID)
if s.failedTGAllocs == nil {
s.failedTGAllocs = make(map[string]*structs.AllocMetric)
}

s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
continue
}

// Update the set of placement nodes
Expand Down Expand Up @@ -327,6 +333,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
// Actual failure to start this task on this candidate node, report it individually
s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
s.addBlocked(node)

continue
}

Expand Down
62 changes: 62 additions & 0 deletions scheduler/system_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1310,6 +1310,68 @@ func TestSystemSched_Queued_With_Constraints(t *testing.T) {

}

// TestSystemSched_NoNodes checks that no error is reported when a lack of
// available nodes prevents placement. It places a system job on a single
// node, marks that node ineligible, registers a new job version, and then
// asserts that the second evaluation completes and carries exactly one
// FailedTGAllocs entry.
func TestSystemSched_NoNodes(t *testing.T) {
	h := NewHarness(t)

	// Create a node
	node := mock.Node()
	node.ComputeClass()
	require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))

	// Make a job
	job := mock.SystemJob()
	require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))

	// Evaluate the job
	eval := &structs.Evaluation{
		Namespace:   structs.DefaultNamespace,
		ID:          uuid.Generate(),
		Priority:    job.Priority,
		TriggeredBy: structs.EvalTriggerJobRegister,
		JobID:       job.ID,
		Status:      structs.EvalStatusPending,
	}

	require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
	require.NoError(t, h.Process(NewSystemScheduler, eval))
	require.Equal(t, "complete", h.Evals[0].Status)

	// QueuedAllocations is drained
	val, ok := h.Evals[0].QueuedAllocations["web"]
	require.True(t, ok)
	require.Equal(t, 0, val)

	// Exactly one plan was submitted for the first evaluation
	require.Len(t, h.Plans, 1)

	// Mark the node as ineligible. The node object is mutated in place
	// rather than re-upserted.
	// NOTE(review): presumably the state store holds this same pointer, so
	// the scheduler observes the change on the next evaluation; confirm.
	node.SchedulingEligibility = structs.NodeSchedulingIneligible

	// Create a new job version, deploy
	job2 := job.Copy()
	job2.Meta["version"] = "2"
	require.NoError(t, h.State.UpsertJob(h.NextIndex(), job2))

	eval2 := &structs.Evaluation{
		Namespace:   structs.DefaultNamespace,
		ID:          uuid.Generate(),
		Priority:    job2.Priority,
		TriggeredBy: structs.EvalTriggerJobRegister,
		JobID:       job2.ID,
		Status:      structs.EvalStatusPending,
	}

	// Ensure the new eval is complete
	require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval2}))
	require.NoError(t, h.Process(NewSystemScheduler, eval2))
	require.Equal(t, "complete", h.Evals[1].Status)

	// Ensure there is a FailedTGAlloc metric
	require.Len(t, h.Evals[1].FailedTGAllocs, 1)
}

// No errors reported when constraints prevent placement
func TestSystemSched_ConstraintErrors(t *testing.T) {
h := NewHarness(t)
Expand Down

0 comments on commit 19f8302

Please sign in to comment.