prometheus · brian-brazil · Sep 10, 2019 · Sep 4, 2019 · Sep 4, 2019 · Sep 5, 2019
diff --git a/history.go b/history.go
@@ -26,10 +26,12 @@ type result struct {
 }
 
 type resultHistory struct {
-	mu         sync.Mutex
-	nextId     int64
-	results    []*result
-	maxResults uint
+	mu                        sync.Mutex // held by List, ListPreservedFailures and Get while they read the slices below
+	nextId                    int64      // presumably the id given to the next added result; confirm in Add
+	results                   []*result  // recent results, oldest first; Add drops index 0 once maxResults is exceeded
+	maxResults                uint       // upper bound on len(results)
+	preservedFailedResults    []*result  // failed results evicted from results, oldest first
+	maxPreservedFailedResults uint       // upper bound on len(preservedFailedResults)
 }
 
 // Add a result to the history.
@@ -48,6 +50,14 @@ func (rh *resultHistory) Add(moduleName, target, debugOutput string, success boo
 
 	rh.results = append(rh.results, r)
 	if uint(len(rh.results)) > rh.maxResults {
+		if !rh.results[0].success {
+			rh.preservedFailedResults = append(rh.preservedFailedResults, rh.results[0])
+			if uint(len(rh.preservedFailedResults)) > rh.maxPreservedFailedResults {
+				preservedFailedResults := make([]*result, len(rh.preservedFailedResults)-1)
+				copy(preservedFailedResults, rh.preservedFailedResults[1:])
+				rh.preservedFailedResults = preservedFailedResults
+			}
+		}
 		results := make([]*result, len(rh.results)-1)
 		copy(results, rh.results[1:])
 		rh.results = results
@@ -62,11 +72,24 @@ func (rh *resultHistory) List() []*result {
 	return rh.results[:]
 }
 
+// ListPreservedFailures hands back the failed results kept after expiring from the main history.
+func (rh *resultHistory) ListPreservedFailures() []*result {
+	rh.mu.Lock()
+	preserved := rh.preservedFailedResults[:]
+	rh.mu.Unlock()
+	return preserved
+}
+
 // Get returns a given result.
 func (rh *resultHistory) Get(id int64) *result {
 	rh.mu.Lock()
 	defer rh.mu.Unlock()
 
+	for _, r := range rh.preservedFailedResults {
+		if r.id == id {
+			return r
+		}
+	}
 	for _, r := range rh.results {
 		if r.id == id {
 			return r

diff --git a/history_test.go b/history_test.go
@@ -0,0 +1,99 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"fmt"
+	"testing"
+)
+
+// TestHistoryKeepsLatestResults adds one more result than maxResults allows
+// and checks that the oldest entry was dropped while the rest stay in order.
+func TestHistoryKeepsLatestResults(t *testing.T) {
+	history := &resultHistory{maxResults: 3, maxPreservedFailedResults: 3}
+	for i := 0; i < 4; i++ {
+		history.Add("module", "target", fmt.Sprintf("result %d", i), true)
+	}
+
+	savedResults := history.List()
+	// Without this check an empty or short list would skip the loop below
+	// and the test would pass without verifying anything.
+	if uint(len(savedResults)) != history.maxResults {
+		t.Errorf("History contained %d results. Expected: %d", len(savedResults), history.maxResults)
+	}
+	for i, r := range savedResults {
+		// "result 0" was evicted, so index i must hold "result i+1".
+		if r.debugOutput != fmt.Sprintf("result %d", i+1) {
+			t.Errorf("History contained the wrong result at index %d", i)
+		}
+	}
+}
+
+// FillHistoryWithMaxSuccesses adds h.maxResults successful results to h,
+// labelling each with the value of h.nextId read just before it is added.
+func FillHistoryWithMaxSuccesses(h *resultHistory) {
+	for remaining := h.maxResults; remaining > 0; remaining-- {
+		label := fmt.Sprintf("result %d", h.nextId)
+		h.Add("module", "target", label, true)
+	}
+}
+
+// FillHistoryWithMaxPreservedFailures adds h.maxPreservedFailedResults failed
+// results to h, labelling each with the value of h.nextId read just before
+// it is added.
+func FillHistoryWithMaxPreservedFailures(h *resultHistory) {
+	for remaining := h.maxPreservedFailedResults; remaining > 0; remaining-- {
+		label := fmt.Sprintf("result %d", h.nextId)
+		h.Add("module", "target", label, false)
+	}
+}
+
+// TestHistoryPreservesExpiredFailedResults drives the history through three
+// phases and checks, after each one, which results remain in the main list
+// and which expired failures were moved to the preserved list.
+//
+// The fill helpers label every result with h.nextId, so the expected labels
+// below assume ids start at 0 and grow by one per Add.
+func TestHistoryPreservesExpiredFailedResults(t *testing.T) {
+	history := &resultHistory{maxResults: 3, maxPreservedFailedResults: 3}
+
+	// Successes are expired, no failures are expired.
+	FillHistoryWithMaxSuccesses(history)
+	FillHistoryWithMaxPreservedFailures(history)
+	savedResults := history.List()
+	savedFailedResults := history.ListPreservedFailures()
+	if len(savedFailedResults) > 0 {
+		t.Errorf("Preserved failures contains failures unnecessarily.")
+	}
+	// The length checks keep the loops below from passing vacuously.
+	if uint(len(savedResults)) != history.maxResults {
+		t.Errorf("History contained %d results. Expected: %d", len(savedResults), history.maxResults)
+	}
+	for i := uint(0); i < uint(len(savedResults)); i++ {
+		expectedDebugOutput := fmt.Sprintf("result %d", i+history.maxResults)
+		if savedResults[i].debugOutput != expectedDebugOutput {
+			t.Errorf("History contained the wrong result at index %d. Expected: %s, Actual: %s", i, expectedDebugOutput, savedResults[i].debugOutput)
+		}
+	}
+
+	// Failures are expired, should all be preserved.
+	FillHistoryWithMaxPreservedFailures(history)
+	savedResults = history.List()
+	savedFailedResults = history.ListPreservedFailures()
+	if uint(len(savedFailedResults)) != history.maxPreservedFailedResults {
+		t.Errorf("Preserved failures contained %d results. Expected: %d", len(savedFailedResults), history.maxPreservedFailedResults)
+	}
+	for i := uint(0); i < uint(len(savedFailedResults)); i++ {
+		expectedDebugOutput := fmt.Sprintf("result %d", i+history.maxResults)
+		if savedFailedResults[i].debugOutput != expectedDebugOutput {
+			// Report the preserved entry that was actually compared.
+			t.Errorf("History contained the wrong result at index %d. Expected: %s, Actual: %s", i, expectedDebugOutput, savedFailedResults[i].debugOutput)
+		}
+	}
+	if uint(len(savedResults)) != history.maxResults {
+		t.Errorf("History contained %d results. Expected: %d", len(savedResults), history.maxResults)
+	}
+	for i := uint(0); i < uint(len(savedResults)); i++ {
+		expectedDebugOutput := fmt.Sprintf("result %d", i+history.maxResults+history.maxPreservedFailedResults)
+		if savedResults[i].debugOutput != expectedDebugOutput {
+			t.Errorf("History contained the wrong result at index %d. Expected: %s, Actual: %s", i, expectedDebugOutput, savedResults[i].debugOutput)
+		}
+	}
+
+	// New expired failures are preserved, new successes are not expired.
+	FillHistoryWithMaxPreservedFailures(history)
+	FillHistoryWithMaxSuccesses(history)
+	savedResults = history.List()
+	savedFailedResults = history.ListPreservedFailures()
+	if uint(len(savedFailedResults)) != history.maxPreservedFailedResults {
+		t.Errorf("Preserved failures contained %d results. Expected: %d", len(savedFailedResults), history.maxPreservedFailedResults)
+	}
+	for i := uint(0); i < uint(len(savedFailedResults)); i++ {
+		expectedDebugOutput := fmt.Sprintf("result %d", i+history.maxResults+history.maxPreservedFailedResults*2)
+		if savedFailedResults[i].debugOutput != expectedDebugOutput {
+			// Report the preserved entry that was actually compared.
+			t.Errorf("History contained the wrong result at index %d. Expected: %s, Actual: %s", i, expectedDebugOutput, savedFailedResults[i].debugOutput)
+		}
+	}
+	if uint(len(savedResults)) != history.maxResults {
+		t.Errorf("History contained %d results. Expected: %d", len(savedResults), history.maxResults)
+	}
+	for i := uint(0); i < uint(len(savedResults)); i++ {
+		expectedDebugOutput := fmt.Sprintf("result %d", i+history.maxResults+history.maxPreservedFailedResults*3)
+		if savedResults[i].debugOutput != expectedDebugOutput {
+			t.Errorf("History contained the wrong result at index %d. Expected: %s, Actual: %s", i, expectedDebugOutput, savedResults[i].debugOutput)
+		}
+	}
+}
diff --git a/main.go b/main.go
@@ -46,11 +46,12 @@ var (
 		C: &config.Config{},
 	}
 
-	configFile    = kingpin.Flag("config.file", "Blackbox exporter configuration file.").Default("blackbox.yml").String()
-	listenAddress = kingpin.Flag("web.listen-address", "The address to listen on for HTTP requests.").Default(":9115").String()
-	timeoutOffset = kingpin.Flag("timeout-offset", "Offset to subtract from timeout in seconds.").Default("0.5").Float64()
-	configCheck   = kingpin.Flag("config.check", "If true validate the config file and then exit.").Default().Bool()
-	historyLimit  = kingpin.Flag("history.limit", "The maximum amount of items to keep in the history.").Default("100").Uint()
+	configFile                  = kingpin.Flag("config.file", "Blackbox exporter configuration file.").Default("blackbox.yml").String()
+	listenAddress               = kingpin.Flag("web.listen-address", "The address to listen on for HTTP requests.").Default(":9115").String()
+	timeoutOffset               = kingpin.Flag("timeout-offset", "Offset to subtract from timeout in seconds.").Default("0.5").Float64()
+	configCheck                 = kingpin.Flag("config.check", "If true validate the config file and then exit.").Default().Bool()
+	historyLimit                = kingpin.Flag("history.limit", "The maximum amount of items to keep in the history.").Default("100").Uint()
+	historyPreservedFailedLimit = kingpin.Flag("history.preserved-failed-limit", "The maximum amount of failed items to preserve after expiration.").Default("5").Uint()
 
 	Probers = map[string]prober.ProbeFn{
 		"http": prober.ProbeHTTP,
@@ -200,7 +201,7 @@ func run() int {
 	kingpin.HelpFlag.Short('h')
 	kingpin.Parse()
 	logger := promlog.New(promlogConfig)
-	rh := &resultHistory{maxResults: *historyLimit}
+	rh := &resultHistory{maxResults: *historyLimit, maxPreservedFailedResults: *historyPreservedFailedLimit}
 
 	level.Info(logger).Log("msg", "Starting blackbox_exporter", "version", version.Info())
 	level.Info(logger).Log("msg", "Build context", version.BuildContext())
@@ -287,7 +288,24 @@ func run() int {
 				html.EscapeString(r.moduleName), html.EscapeString(r.target), success, r.id)
 		}
 
-		w.Write([]byte(`</table></body>
+		w.Write([]byte(`</table>
+		<h2>Preserved Failed Probes</h2>
+    <table border='1'><tr><th>Module</th><th>Target</th><th>Result</th><th>Debug</th>`))
+
+		preservedFailedResults := rh.ListPreservedFailures()
+
+		for i := len(preservedFailedResults) - 1; i >= 0; i-- {
+			r := preservedFailedResults[i]
+			success := "Success"
+			if !r.success {
+				success = "<strong>Failure</strong>"
+			}
+			fmt.Fprintf(w, "<tr><td>%s</td><td>%s</td><td>%s</td><td><a href='logs?id=%d'>Logs</a></td></td>",
+				html.EscapeString(r.moduleName), html.EscapeString(r.target), success, r.id)
+		}
+
+		w.Write([]byte(`</table>
+		</body>
     </html>`))
 	})