From 4e0f80e4c05a071566cc9ffdfbee75bbb257f3a5 Mon Sep 17 00:00:00 2001 From: Nick Pillitteri <56quarters@users.noreply.github.com> Date: Mon, 28 Oct 2024 09:48:21 -0400 Subject: [PATCH] chore: Remove flaky TestRulerEvaluationDelay test (#9741) The test is unreliable and we've been unable to figure out why it sometimes fails. This leads to people blindly re-running failing CI instead of assuming it has caught a legitimate issue. Remove the test since it reduces confidence in the test suite. Fixes https://github.com/grafana/mimir/issues/4857 --- integration/configs.go | 8 --- integration/ruler_test.go | 129 -------------------------------------- 2 files changed, 137 deletions(-) diff --git a/integration/configs.go b/integration/configs.go index c1b1716fe2f..9c2e7ff3853 100644 --- a/integration/configs.go +++ b/integration/configs.go @@ -88,14 +88,6 @@ receivers: labels: {} annotations: {} ` - - mimirRulerEvalStaleNanConfigYaml = `groups: -- name: rule - interval: 1s - rules: - - record: stale_nan_eval - expr: a_sometimes_stale_nan_series * 2 -` ) var ( diff --git a/integration/ruler_test.go b/integration/ruler_test.go index 7e399bcb69d..2dc88d86964 100644 --- a/integration/ruler_test.go +++ b/integration/ruler_test.go @@ -27,7 +27,6 @@ import ( "github.com/prometheus/common/model" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/model/rulefmt" - "github.com/prometheus/prometheus/model/value" "github.com/prometheus/prometheus/prompb" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -204,134 +203,6 @@ func TestRulerAPISingleBinary(t *testing.T) { require.NoError(t, mimirRestarted.WaitSumMetrics(e2e.Equals(1), "cortex_ruler_managers_total")) } -func TestRulerEvaluationDelay(t *testing.T) { - s, err := e2e.NewScenario(networkName) - require.NoError(t, err) - defer s.Close() - - namespace := "ns" - user := "anonymous" - - evaluationDelay := time.Minute * 5 - - // Start dependencies. - minio := e2edb.NewMinio(9000, blocksBucketName) - require.NoError(t, s.StartAndWaitReady(minio)) - - flags := mergeFlags( - BlocksStorageFlags(), - BlocksStorageS3Flags(), - map[string]string{ - "-ruler-storage.local.directory": filepath.Join(e2e.ContainerSharedDir, "ruler_configs"), - "-ruler.poll-interval": "2s", - "-ruler.rule-path": filepath.Join(e2e.ContainerSharedDir, "rule_tmp/"), - "-ruler.evaluation-delay-duration": evaluationDelay.String(), - }, - ) - - // Start Mimir components. - require.NoError(t, copyFileToSharedDir(s, "docs/configurations/single-process-config-blocks.yaml", mimirConfigFile)) - require.NoError(t, writeFileToSharedDir(s, filepath.Join("ruler_configs", user, namespace), []byte(mimirRulerEvalStaleNanConfigYaml))) - mimir := e2emimir.NewSingleBinary("mimir", flags, e2emimir.WithConfigFile(mimirConfigFile), e2emimir.WithPorts(9009, 9095)) - require.NoError(t, s.StartAndWaitReady(mimir)) - - // Create a client with the ruler address configured - c, err := e2emimir.NewClient(mimir.HTTPEndpoint(), mimir.HTTPEndpoint(), "", mimir.HTTPEndpoint(), "") - require.NoError(t, err) - - now := time.Now() - - // Generate series that includes stale nans - var samplesToSend = 10 - series := prompb.TimeSeries{ - Labels: []prompb.Label{ - {Name: "__name__", Value: "a_sometimes_stale_nan_series"}, - {Name: "instance", Value: "sometimes-stale"}, - }, - } - series.Samples = make([]prompb.Sample, samplesToSend) - posStale := 2 - - // Create samples, that are delayed by the evaluation delay with increasing values. - for pos := range series.Samples { - series.Samples[pos].Timestamp = e2e.TimeToMilliseconds(now.Add(-evaluationDelay).Add(time.Duration(pos) * time.Second)) - series.Samples[pos].Value = float64(pos + 1) - - // insert staleness marker at the positions marked by posStale - if pos == posStale { - series.Samples[pos].Value = math.Float64frombits(value.StaleNaN) - } - } - - // Insert metrics - res, err := c.Push([]prompb.TimeSeries{series}) - require.NoError(t, err) - require.Equal(t, 200, res.StatusCode) - - // Get number of rule evaluations just after push - ruleEvaluationsAfterPush, err := mimir.SumMetrics([]string{"cortex_prometheus_rule_evaluations_total"}) - require.NoError(t, err) - - // Wait until the rule is evaluated for the first time - require.NoError(t, mimir.WaitSumMetrics(e2e.Greater(ruleEvaluationsAfterPush[0]), "cortex_prometheus_rule_evaluations_total")) - - // Query the timestamp of the latest result to ensure the evaluation is delayed - result, err := c.Query("timestamp(stale_nan_eval)", now) - require.NoError(t, err) - require.Equal(t, model.ValVector, result.Type()) - - vector := result.(model.Vector) - require.Equal(t, 1, vector.Len(), "expect one sample returned") - - // 290 seconds gives 10 seconds of slack between the rule evaluation and the query - // to account for CI latency, but ensures the latest evaluation was in the past. - var maxDiff int64 = 290_000 - require.GreaterOrEqual(t, e2e.TimeToMilliseconds(time.Now())-int64(vector[0].Value)*1000, maxDiff) - - // Wait until all the pushed samples have been evaluated by the rule. This - // ensures that rule results are successfully written even after a - // staleness period. - require.NoError(t, mimir.WaitSumMetrics(e2e.Greater(ruleEvaluationsAfterPush[0]+float64(samplesToSend)), "cortex_prometheus_rule_evaluations_total")) - - // query all results to verify rules have been evaluated correctly - t.Log("querying from ", now.Add(-evaluationDelay), "to", now) - result, err = c.QueryRange("stale_nan_eval", now.Add(-evaluationDelay), now, time.Second) - require.NoError(t, err) - require.Equal(t, model.ValMatrix, result.Type()) - - matrix := result.(model.Matrix) - require.GreaterOrEqual(t, 1, matrix.Len(), "expect at least a series returned") - - // Iterate through the values recorded and ensure they exist as expected. - inputPos := 0 - for _, m := range matrix { - for _, v := range m.Values { - // Skip values for stale positions - if inputPos == posStale { - inputPos++ - } - - expectedValue := model.SampleValue(2 * (inputPos + 1)) - assert.Equal(t, expectedValue, v.Value) - t.Log( - "expected value", expectedValue, - "actual value", v.Value, - "actual timestamp", v.Timestamp, - "expected timestamp", now.Add(-evaluationDelay).Add(time.Duration(inputPos)*time.Second), - ) - - // Look for next value - inputPos++ - - // We have found all input values - if inputPos >= len(series.Samples) { - break - } - } - } - assert.Equal(t, len(series.Samples), inputPos, "expect to have returned all evaluations") -} - func TestRulerSharding(t *testing.T) { const numRulesGroups = 100