diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index b84a8177c45..6ecb078fa48 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -2449,7 +2449,9 @@ func TestCoreScheduler_FailLoop(t *testing.T) { out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5) require.NoError(err) - require.Nil(out, - "failed core jobs should not result in follow-up. TriggeredBy: %v", - out.TriggeredBy) + if out != nil { + t.Fatalf( + "failed core jobs should not result in follow-up. TriggeredBy: %v", + out.TriggeredBy) + } } diff --git a/nomad/leader.go b/nomad/leader.go index c19b2159b26..8ad929b181e 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -640,25 +640,31 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) { updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit) s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString()) - // Create a follow-up evaluation that will be used to retry the - // scheduling for the job after the cluster is hopefully more stable - // due to the fairly large backoff. - followupEvalWait := s.config.EvalFailedFollowupBaselineDelay + - time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange))) - - followupEval := eval.CreateFailedFollowUpEval(followupEvalWait) - updateEval.NextEval = followupEval.ID - updateEval.UpdateModifyTime() - - // Update via Raft - req := structs.EvalUpdateRequest{ - Evals: []*structs.Evaluation{updateEval, followupEval}, - } - if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { - s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err) - continue + // Core job evals that fail or span leader elections will never + // succeed because the follow-up doesn't have the leader ACL. We + // rely on the leader to schedule new core jobs periodically + // instead. + if eval.Type != structs.JobTypeCore { + + // Create a follow-up evaluation that will be used to retry the + // scheduling for the job after the cluster is hopefully more stable + // due to the fairly large backoff. + followupEvalWait := s.config.EvalFailedFollowupBaselineDelay + + time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange))) + + followupEval := eval.CreateFailedFollowUpEval(followupEvalWait) + updateEval.NextEval = followupEval.ID + updateEval.UpdateModifyTime() + + // Update via Raft + req := structs.EvalUpdateRequest{ + Evals: []*structs.Evaluation{updateEval, followupEval}, + } + if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { + s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err) + continue + } } - // Ack completion s.evalBroker.Ack(eval.ID, token) }