Skip to content

Commit

Permalink
chore: Remove flaky TestRulerEvaluationDelay test (#9741)
Browse files Browse the repository at this point in the history
The test is unreliable and we've been unable to figure out why it
sometimes fails. This leads to people blindly re-running failing CI
instead of assuming it has caught a legitimate issue. Remove the test
since it reduces confidence in the test suite.

Fixes #4857
  • Loading branch information
56quarters authored Oct 28, 2024
1 parent ee9fdc8 commit 4e0f80e
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 137 deletions.
8 changes: 0 additions & 8 deletions integration/configs.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,6 @@ receivers:
labels: {}
annotations: {}
`

// mimirRulerEvalStaleNanConfigYaml is a ruler configuration containing a
// single group named "rule", evaluated every second, with one recording rule
// that writes a_sometimes_stale_nan_series * 2 to stale_nan_eval.
// NOTE(review): the YAML nesting indentation is not visible in this view —
// confirm it against the checked-in file.
mimirRulerEvalStaleNanConfigYaml = `groups:
- name: rule
interval: 1s
rules:
- record: stale_nan_eval
expr: a_sometimes_stale_nan_series * 2
`
)

var (
Expand Down
129 changes: 0 additions & 129 deletions integration/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/rulefmt"
"github.com/prometheus/prometheus/model/value"
"github.com/prometheus/prometheus/prompb"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -204,134 +203,6 @@ func TestRulerAPISingleBinary(t *testing.T) {
require.NoError(t, mimirRestarted.WaitSumMetrics(e2e.Equals(1), "cortex_ruler_managers_total"))
}

// TestRulerEvaluationDelay checks that the ruler honours
// -ruler.evaluation-delay-duration and keeps recording results across a
// staleness marker in the input series.
//
// It starts Minio and a single-binary Mimir configured with a 5 minute
// evaluation delay and the mimirRulerEvalStaleNanConfigYaml rule group, pushes
// ten samples (one per second, starting at now-5m, with a stale NaN at index
// posStale), and then verifies that:
//   - the newest stale_nan_eval sample is at least 290s older than wall-clock time;
//   - every non-stale input sample produced a recorded value of twice the input.
func TestRulerEvaluationDelay(t *testing.T) {
	s, err := e2e.NewScenario(networkName)
	require.NoError(t, err)
	defer s.Close()

	namespace := "ns"
	user := "anonymous"

	evaluationDelay := time.Minute * 5

	// Start dependencies.
	minio := e2edb.NewMinio(9000, blocksBucketName)
	require.NoError(t, s.StartAndWaitReady(minio))

	flags := mergeFlags(
		BlocksStorageFlags(),
		BlocksStorageS3Flags(),
		map[string]string{
			"-ruler-storage.local.directory":   filepath.Join(e2e.ContainerSharedDir, "ruler_configs"),
			"-ruler.poll-interval":             "2s",
			"-ruler.rule-path":                 filepath.Join(e2e.ContainerSharedDir, "rule_tmp/"),
			"-ruler.evaluation-delay-duration": evaluationDelay.String(),
		},
	)

	// Start Mimir components.
	require.NoError(t, copyFileToSharedDir(s, "docs/configurations/single-process-config-blocks.yaml", mimirConfigFile))
	require.NoError(t, writeFileToSharedDir(s, filepath.Join("ruler_configs", user, namespace), []byte(mimirRulerEvalStaleNanConfigYaml)))
	mimir := e2emimir.NewSingleBinary("mimir", flags, e2emimir.WithConfigFile(mimirConfigFile), e2emimir.WithPorts(9009, 9095))
	require.NoError(t, s.StartAndWaitReady(mimir))

	// Create a client with the ruler address configured.
	c, err := e2emimir.NewClient(mimir.HTTPEndpoint(), mimir.HTTPEndpoint(), "", mimir.HTTPEndpoint(), "")
	require.NoError(t, err)

	now := time.Now()

	// Generate a series that includes a stale NaN.
	var samplesToSend = 10
	series := prompb.TimeSeries{
		Labels: []prompb.Label{
			{Name: "__name__", Value: "a_sometimes_stale_nan_series"},
			{Name: "instance", Value: "sometimes-stale"},
		},
	}
	series.Samples = make([]prompb.Sample, samplesToSend)
	posStale := 2

	// Create samples that are delayed by the evaluation delay, one second
	// apart, with increasing values (1, 2, 3, ...).
	for pos := range series.Samples {
		series.Samples[pos].Timestamp = e2e.TimeToMilliseconds(now.Add(-evaluationDelay).Add(time.Duration(pos) * time.Second))
		series.Samples[pos].Value = float64(pos + 1)

		// Insert a staleness marker at the position marked by posStale.
		if pos == posStale {
			series.Samples[pos].Value = math.Float64frombits(value.StaleNaN)
		}
	}

	// Insert metrics.
	res, err := c.Push([]prompb.TimeSeries{series})
	require.NoError(t, err)
	require.Equal(t, 200, res.StatusCode)

	// Get number of rule evaluations just after push.
	ruleEvaluationsAfterPush, err := mimir.SumMetrics([]string{"cortex_prometheus_rule_evaluations_total"})
	require.NoError(t, err)

	// Wait until the rule is evaluated for the first time.
	require.NoError(t, mimir.WaitSumMetrics(e2e.Greater(ruleEvaluationsAfterPush[0]), "cortex_prometheus_rule_evaluations_total"))

	// Query the timestamp of the latest result to ensure the evaluation is delayed.
	result, err := c.Query("timestamp(stale_nan_eval)", now)
	require.NoError(t, err)
	require.Equal(t, model.ValVector, result.Type())

	vector := result.(model.Vector)
	require.Equal(t, 1, vector.Len(), "expect one sample returned")

	// 290 seconds gives 10 seconds of slack between the rule evaluation and the query
	// to account for CI latency, but ensures the latest evaluation was in the past.
	var maxDiff int64 = 290_000
	require.GreaterOrEqual(t, e2e.TimeToMilliseconds(time.Now())-int64(vector[0].Value)*1000, maxDiff)

	// Wait until all the pushed samples have been evaluated by the rule. This
	// ensures that rule results are successfully written even after a
	// staleness period.
	require.NoError(t, mimir.WaitSumMetrics(e2e.Greater(ruleEvaluationsAfterPush[0]+float64(samplesToSend)), "cortex_prometheus_rule_evaluations_total"))

	// Query all results to verify rules have been evaluated correctly.
	t.Log("querying from ", now.Add(-evaluationDelay), "to", now)
	result, err = c.QueryRange("stale_nan_eval", now.Add(-evaluationDelay), now, time.Second)
	require.NoError(t, err)
	require.Equal(t, model.ValMatrix, result.Type())

	matrix := result.(model.Matrix)
	// The actual value goes first: this asserts matrix.Len() >= 1. With the
	// operands the other way round the check would pass on an empty result.
	require.GreaterOrEqual(t, matrix.Len(), 1, "expect at least a series returned")

	// Iterate through the values recorded and ensure they exist as expected.
	inputPos := 0
	for _, m := range matrix {
		for _, v := range m.Values {
			// Skip values for stale positions: no result is expected for them.
			if inputPos == posStale {
				inputPos++
			}

			// The rule records input * 2, and input values are pos + 1.
			expectedValue := model.SampleValue(2 * (inputPos + 1))
			assert.Equal(t, expectedValue, v.Value)
			t.Log(
				"expected value", expectedValue,
				"actual value", v.Value,
				"actual timestamp", v.Timestamp,
				"expected timestamp", now.Add(-evaluationDelay).Add(time.Duration(inputPos)*time.Second),
			)

			// Look for next value.
			inputPos++

			// We have found all input values.
			if inputPos >= len(series.Samples) {
				break
			}
		}
	}
	assert.Equal(t, len(series.Samples), inputPos, "expect to have returned all evaluations")
}

func TestRulerSharding(t *testing.T) {
const numRulesGroups = 100

Expand Down

0 comments on commit 4e0f80e

Please sign in to comment.