From 3b056bae1c9d4dd053aec39b118085d1ee11c40e Mon Sep 17 00:00:00 2001 From: Andrew Cholakian Date: Mon, 2 Oct 2023 12:04:30 -0500 Subject: [PATCH] [Heartbeat] Only retry down monitors on first down check (#36719) * [Heartbeat] Only retry down monitors on first down check When monitors go down we don't want to keep using retries when it was previously down, we only want to retry when it goes down the first time per the retries spec. * Update failing test --- .../monitors/wrappers/summarizer/plugstatestat.go | 11 +++++++++-- .../monitors/wrappers/summarizer/summarizer_test.go | 6 +++--- heartbeat/monitors/wrappers/wrappers_test.go | 10 ++++------ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/heartbeat/monitors/wrappers/summarizer/plugstatestat.go b/heartbeat/monitors/wrappers/summarizer/plugstatestat.go index 62a1c8ee5b85..9567a666ad6d 100644 --- a/heartbeat/monitors/wrappers/summarizer/plugstatestat.go +++ b/heartbeat/monitors/wrappers/summarizer/plugstatestat.go @@ -149,8 +149,15 @@ func (ssp *commonSSP) BeforeSummary(event *beat.Event) BeforeSummaryActions { // determine if a retry is needed lastStatus := ssp.stateTracker.GetCurrentStatus(ssp.sf) - // FinalAttempt is true if no retries will occur - retry := ssp.js.Status == monitorstate.StatusDown && ssp.js.Attempt < ssp.js.MaxAttempts + curCheckDown := ssp.js.Status == monitorstate.StatusDown + lastStateUp := ssp.stateTracker.GetCurrentStatus(ssp.sf) == monitorstate.StatusUp + hasAttemptsRemaining := ssp.js.Attempt < ssp.js.MaxAttempts + + // retry if... + retry := curCheckDown && // the current check is down + lastStateUp && // we were previously up, if we were previously down we just check once + hasAttemptsRemaining // and we are configured to actually make multiple attempts + // if we aren't retrying this is the final attempt ssp.js.FinalAttempt = !retry ms := ssp.stateTracker.RecordStatus(ssp.sf, ssp.js.Status, ssp.js.FinalAttempt) diff --git a/heartbeat/monitors/wrappers/summarizer/summarizer_test.go b/heartbeat/monitors/wrappers/summarizer/summarizer_test.go index 64472eb1c9ad..63de2da71a81 100644 --- a/heartbeat/monitors/wrappers/summarizer/summarizer_test.go +++ b/heartbeat/monitors/wrappers/summarizer/summarizer_test.go @@ -62,7 +62,7 @@ func TestSummarizer(t *testing.T) { 2, "du", "du", - "12", + "11", 2, testURL, }, @@ -80,7 +80,7 @@ func TestSummarizer(t *testing.T) { 2, "dddddddd", "dddddddd", - "12121212", + "11111111", 8, testURL, }, @@ -89,7 +89,7 @@ func TestSummarizer(t *testing.T) { 2, "udddduuu", "uuddduuu", - "11212111", + "11211111", 8, testURL, }, diff --git a/heartbeat/monitors/wrappers/wrappers_test.go b/heartbeat/monitors/wrappers/wrappers_test.go index ffdb161e62f0..ec3111b176e6 100644 --- a/heartbeat/monitors/wrappers/wrappers_test.go +++ b/heartbeat/monitors/wrappers/wrappers_test.go @@ -351,6 +351,10 @@ func TestMultiJobConts(t *testing.T) { }) } +// TestRetryMultiCont is of somewhat dubious utility at the moment, +// it mostly tests that we __don't__ retry on an initial down. +// retry logic is better and more completely tested in the summarizer +// and scenario tests. func TestRetryMultiCont(t *testing.T) { uniqScope := isdef.ScopedIsUnique() @@ -466,12 +470,6 @@ func TestRetryMultiCont(t *testing.T) { summarizertesthelper.SummaryValidator(expected.js.Up, expected.js.Down), hbtestllext.MaybeHasDuration, ), - contJobValidator("http://foo.com", "1st"), - lookslike.Compose( - contJobValidator("http://foo.com", "2nd"), - summarizertesthelper.SummaryValidator(expected.js.Up, expected.js.Down), - hbtestllext.MaybeHasDuration, - ), }, nil, nil,