From 108a42283b9c3906d9b50a6e83f1dd0c5e312e72 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Tue, 18 Aug 2020 16:48:43 -0400
Subject: [PATCH] failed core jobs should not have follow-ups (#8682)

If a core job eval fails more than the delivery limit, the leader will
create a new eval with the TriggeredBy field set to `failed-follow-up`.

Evaluations for core jobs have the leader's ACL, which is not valid on
another leader after an election. The `failed-follow-up` evals do not
have ACLs, so core job evals that fail more than the delivery limit or
that span leader elections will never succeed and will be re-enqueued
forever. So we should not retry with a `failed-follow-up`.
---
 nomad/core_sched_test.go | 61 ++++++++++++++++++++++++++++++++++++++++
 nomad/leader.go          | 42 +++++++++++++++------------
 2 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go
index a075b6377ce..6ecb078fa48 100644
--- a/nomad/core_sched_test.go
+++ b/nomad/core_sched_test.go
@@ -2394,3 +2394,64 @@ func TestCoreScheduler_CSIVolumeClaimGC(t *testing.T) {
 	}, time.Second*1, 10*time.Millisecond, "claims were released unexpectedly")
 }
+
+func TestCoreScheduler_FailLoop(t *testing.T) {
+	t.Parallel()
+	require := require.New(t)
+
+	srv, cleanupSrv := TestServer(t, func(c *Config) {
+		c.NumSchedulers = 0 // Prevent automatic dequeue
+		c.EvalDeliveryLimit = 2
+		c.EvalFailedFollowupBaselineDelay = time.Duration(50 * time.Millisecond)
+		c.EvalFailedFollowupDelayRange = time.Duration(1 * time.Millisecond)
+	})
+	defer cleanupSrv()
+	codec := rpcClient(t, srv)
+	sched := []string{structs.JobTypeCore}
+
+	testutil.WaitForResult(func() (bool, error) {
+		return srv.evalBroker.Enabled(), nil
+	}, func(err error) {
+		t.Fatalf("should enable eval broker")
+	})
+
+	// Enqueue a core job eval that can never succeed because it was enqueued
+	// by another leader that's now gone
+	expected := srv.coreJobEval(structs.CoreJobCSIPluginGC, 100)
+	expected.LeaderACL = "nonsense"
+	srv.evalBroker.Enqueue(expected)
+
+	nack := func(evalID, token string) error {
+		req := &structs.EvalAckRequest{
+			EvalID:       evalID,
+			Token:        token,
+			WriteRequest: structs.WriteRequest{Region: "global"},
+		}
+		var resp structs.GenericResponse
+		return msgpackrpc.CallWithCodec(codec, "Eval.Nack", req, &resp)
+	}
+
+	out, token, err := srv.evalBroker.Dequeue(sched, time.Second*5)
+	require.NoError(err)
+	require.NotNil(out)
+	require.Equal(expected, out)
+
+	// first fail
+	require.NoError(nack(out.ID, token))
+
+	out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5)
+	require.NoError(err)
+	require.NotNil(out)
+	require.Equal(expected, out)
+
+	// second fail, should not result in failed-follow-up
+	require.NoError(nack(out.ID, token))
+
+	out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5)
+	require.NoError(err)
+	if out != nil {
+		t.Fatalf(
+			"failed core jobs should not result in follow-up. TriggeredBy: %v",
+			out.TriggeredBy)
+	}
+}
diff --git a/nomad/leader.go b/nomad/leader.go
index c19b2159b26..8ad929b181e 100644
--- a/nomad/leader.go
+++ b/nomad/leader.go
@@ -640,25 +640,31 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
 			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
 			s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString())
 
-			// Create a follow-up evaluation that will be used to retry the
-			// scheduling for the job after the cluster is hopefully more stable
-			// due to the fairly large backoff.
-			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
-				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
-
-			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
-			updateEval.NextEval = followupEval.ID
-			updateEval.UpdateModifyTime()
-
-			// Update via Raft
-			req := structs.EvalUpdateRequest{
-				Evals: []*structs.Evaluation{updateEval, followupEval},
-			}
-			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
-				s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
-				continue
+			// Core job evals that fail or span leader elections will never
+			// succeed because the follow-up doesn't have the leader ACL. We
+			// rely on the leader to schedule new core jobs periodically
+			// instead.
+			if eval.Type != structs.JobTypeCore {
+
+				// Create a follow-up evaluation that will be used to retry the
+				// scheduling for the job after the cluster is hopefully more stable
+				// due to the fairly large backoff.
+				followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
+					time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
+
+				followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
+				updateEval.NextEval = followupEval.ID
+				updateEval.UpdateModifyTime()
+
+				// Update via Raft
+				req := structs.EvalUpdateRequest{
+					Evals: []*structs.Evaluation{updateEval, followupEval},
+				}
+				if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
+					s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
+					continue
+				}
 			}
 
-			// Ack completion
 			s.evalBroker.Ack(eval.ID, token)
 		}