Skip to content

Commit

Permalink
failed core jobs should not have follow-ups
Browse files Browse the repository at this point in the history
If a core job fails more than the delivery limit, the leader will create a new
eval with the TriggeredBy field set to `failed-follow-up`.

Evaluations for core jobs have the leader's ACL, which is not valid on another
leader after an election. The `failed-follow-up` evals do not have ACLs, so
core job evals that fail more than the delivery limit or core job evals that
span leader elections will never succeed and will be re-enqueued forever. So
we should not retry with a `failed-follow-up`.
  • Loading branch information
tgross committed Aug 17, 2020
1 parent 42de704 commit 5895555
Showing 1 changed file with 59 additions and 0 deletions.
59 changes: 59 additions & 0 deletions nomad/core_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2394,3 +2394,62 @@ func TestCoreScheduler_CSIVolumeClaimGC(t *testing.T) {
}, time.Second*1, 10*time.Millisecond, "claims were released unexpectedly")

}

// TestCoreScheduler_FailLoop verifies that a core job eval which has been
// nacked as many times as the configured delivery limit does not produce a
// `failed-follow-up` eval.
//
// The eval under test carries a bogus LeaderACL to stand in for an eval that
// was enqueued by a leader that is now gone, so it can never succeed.
func TestCoreScheduler_FailLoop(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	srv, cleanupSrv := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0 // Prevent automatic dequeue
		c.EvalDeliveryLimit = 2
		// Keep the follow-up delay short so that, if a follow-up were
		// created, it would show up well within the dequeue timeout below.
		c.EvalFailedFollowupBaselineDelay = 50 * time.Millisecond
		c.EvalFailedFollowupDelayRange = 1 * time.Millisecond
	})
	defer cleanupSrv()
	codec := rpcClient(t, srv)
	sched := []string{structs.JobTypeCore}

	testutil.WaitForResult(func() (bool, error) {
		return srv.evalBroker.Enabled(), nil
	}, func(err error) {
		t.Fatalf("should enable eval broker")
	})

	// Enqueue a core job eval that can never succeed because it was enqueued
	// by another leader that's now gone
	expected := srv.coreJobEval(structs.CoreJobCSIPluginGC, 100)
	expected.LeaderACL = "nonsense"
	srv.evalBroker.Enqueue(expected)

	// nack reports a failed delivery of the eval through the Eval.Nack RPC.
	nack := func(evalID, token string) error {
		req := &structs.EvalAckRequest{
			EvalID:       evalID,
			Token:        token,
			WriteRequest: structs.WriteRequest{Region: "global"},
		}
		var resp structs.GenericResponse
		return msgpackrpc.CallWithCodec(codec, "Eval.Nack", req, &resp)
	}

	out, token, err := srv.evalBroker.Dequeue(sched, time.Second*5)
	require.NoError(err)
	require.NotNil(out)
	require.Equal(expected, out)

	// first fail
	require.NoError(nack(out.ID, token))

	out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5)
	require.NoError(err)
	require.NotNil(out)
	require.Equal(expected, out)

	// second fail, should not result in failed-follow-up
	require.NoError(nack(out.ID, token))

	out, _, err = srv.evalBroker.Dequeue(sched, time.Second*5)
	require.NoError(err)

	// Guard explicitly rather than passing out.TriggeredBy as a message
	// argument to require.Nil: call arguments are evaluated before the call,
	// so dereferencing out on the expected (nil) path would panic.
	if out != nil {
		t.Fatalf(
			"failed core jobs should not result in follow-up. TriggeredBy: %v",
			out.TriggeredBy)
	}
}

0 comments on commit 5895555

Please sign in to comment.