From 5895555d66cd258f0e5ebf5505b4c989b9768d64 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 17 Aug 2020 16:04:10 -0400 Subject: [PATCH] failed core jobs should not have follow-ups If a core job fails more than the delivery limit, the leader will create a new eval with the TriggeredBy field set to `failed-follow-up`. Evaluations for core jobs have the leader's ACL, which is not valid on another leader after an election. The `failed-follow-up` evals do not have ACLs, so core job evals that fail more than the delivery limit or core job evals that span leader elections will never succeed and will be re-enqueued forever. So we should not retry with a `failed-follow-up`. --- nomad/core_sched_test.go | 59 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index a075b6377ce..b84a8177c45 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -2394,3 +2394,62 @@ func TestCoreScheduler_CSIVolumeClaimGC(t *testing.T) { }, time.Second*1, 10*time.Millisecond, "claims were released unexpectedly") } + +func TestCoreScheduler_FailLoop(t *testing.T) { + t.Parallel() + require := require.New(t) + + srv, cleanupSrv := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 // Prevent automatic dequeue + c.EvalDeliveryLimit = 2 + c.EvalFailedFollowupBaselineDelay = time.Duration(50 * time.Millisecond) + c.EvalFailedFollowupDelayRange = time.Duration(1 * time.Millisecond) + }) + defer cleanupSrv() + codec := rpcClient(t, srv) + sched := []string{structs.JobTypeCore} + + testutil.WaitForResult(func() (bool, error) { + return srv.evalBroker.Enabled(), nil + }, func(err error) { + t.Fatalf("should enable eval broker") + }) + + // Enqueue a core job eval that can never succeed because it was enqueued + // by another leader that's now gone + expected := srv.coreJobEval(structs.CoreJobCSIPluginGC, 100) + expected.LeaderACL = "nonsense" + srv.evalBroker.Enqueue(expected) + + nack := func(evalID, token string) error { + req := &structs.EvalAckRequest{ + EvalID: evalID, + Token: token, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var resp structs.GenericResponse + return msgpackrpc.CallWithCodec(codec, "Eval.Nack", req, &resp) + } + + out, token, err := srv.evalBroker.Dequeue(sched, time.Second*5) + require.NoError(err) + require.NotNil(out) + require.Equal(expected, out) + + // first fail + require.NoError(nack(out.ID, token)) + + out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5) + require.NoError(err) + require.NotNil(out) + require.Equal(expected, out) + + // second fail, should not result in failed-follow-up + require.NoError(nack(out.ID, token)) + + out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5) + require.NoError(err) + require.Nil(out, + "failed core jobs should not result in follow-up. TriggeredBy: %v", + out.TriggeredBy) +}