From d3a4bccf2c9a1b6f1775e8da2d2d1ce072109231 Mon Sep 17 00:00:00 2001
From: Aaditya Sondhi <20070511+aadityasondhi@users.noreply.github.com>
Date: Tue, 12 Nov 2024 16:07:23 -0500
Subject: [PATCH] roachtest: snapshot ingest roachtest improvements

This patch contains some small improvements to better test the
bandwidth subtest of the snapshot ingest roachtest.

Informs #86857

Release note: None
---
 .../admission_control_snapshot_overload_io.go | 93 ++++++++++---------
 1 file changed, 48 insertions(+), 45 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go b/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go
index 9f664be609d4..12b2e3f80297 100644
--- a/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go
+++ b/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go
@@ -70,9 +70,10 @@ func registerSnapshotOverloadIO(r registry.Registry) {
 		limitDiskBandwidth: false,
 		readPercent:        75,
 		workloadBlockBytes: 12288,
+		rebalanceRate:      "256MiB",
 	}))
 
-	// This tests the behaviour of snpashot ingestion in bandwidth constrained
+	// This tests the behaviour of snapshot ingestion in bandwidth constrained
 	// environments.
 	r.Add(spec("bandwidth", admissionControlSnapshotOverloadIOOpts{
 		// 2x headroom from the ~500GB pre-population of the test.
@@ -81,6 +82,7 @@ func registerSnapshotOverloadIO(r registry.Registry) {
 		limitDiskBandwidth: true,
 		readPercent:        20,
 		workloadBlockBytes: 1024,
+		rebalanceRate:      "1GiB",
 	}))
 }
 
@@ -91,6 +93,7 @@ type admissionControlSnapshotOverloadIOOpts struct {
 	limitDiskBandwidth bool
 	readPercent        int
 	workloadBlockBytes int
+	rebalanceRate      string
 }
 
 func runAdmissionControlSnapshotOverloadIO(
@@ -138,9 +141,9 @@ func runAdmissionControlSnapshotOverloadIO(
 			t.Fatalf("failed to set storage.ingest_split.enabled: %v", err)
 		}
 
-		// Set a high rebalance rate.
+		// Set rebalance rate.
 		if _, err := db.ExecContext(
-			ctx, "SET CLUSTER SETTING kv.snapshot_rebalance.max_rate = '256MiB'"); err != nil {
+			ctx, fmt.Sprintf("SET CLUSTER SETTING kv.snapshot_rebalance.max_rate = '%s'", cfg.rebalanceRate)); err != nil {
 			t.Fatalf("failed to set kv.snapshot_rebalance.max_rate: %v", err)
 		}
 	}
@@ -262,50 +265,50 @@ func runAdmissionControlSnapshotOverloadIO(
 			return float64(fromVec[0].Value), nil
 		}
 
-		// TODO(aaditya): assert on disk bandwidth subtest once integrated.
-		if !cfg.limitDiskBandwidth {
-			// Assert on l0 sublevel count and p99 latencies.
-			latencyMetric := divQuery("histogram_quantile(0.99, sum by(le) (rate(sql_service_latency_bucket[2m])))", 1<<20 /* 1ms */)
-			const latencyThreshold = 100 // 100ms since the metric is scaled to 1ms above.
-			const sublevelMetric = "storage_l0_sublevels"
-			const sublevelThreshold = 20
-			var l0SublevelCount []float64
-			const sampleCountForL0Sublevel = 12
-			const collectionIntervalSeconds = 10.0
-			// Loop for ~120 minutes.
-			const numIterations = int(120 / (collectionIntervalSeconds / 60))
-			numErrors := 0
-			numSuccesses := 0
-			for i := 0; i < numIterations; i++ {
-				time.Sleep(collectionIntervalSeconds * time.Second)
-				val, err := getHistMetricVal(latencyMetric)
-				if err != nil {
-					numErrors++
-					continue
-				}
-				if val > latencyThreshold {
-					t.Fatalf("sql p99 latency %f exceeded threshold", val)
-				}
-				val, err = getMetricVal(sublevelMetric, "store")
-				if err != nil {
-					numErrors++
-					continue
-				}
-				l0SublevelCount = append(l0SublevelCount, val)
-				// We want to use the mean of the last 2m of data to avoid short-lived
-				// spikes causing failures.
-				if len(l0SublevelCount) >= sampleCountForL0Sublevel {
-					latestSampleMeanL0Sublevels := roachtestutil.GetMeanOverLastN(sampleCountForL0Sublevel, l0SublevelCount)
-					if latestSampleMeanL0Sublevels > sublevelThreshold {
-						t.Fatalf("sub-level mean %f over last %d iterations exceeded threshold", latestSampleMeanL0Sublevels, sampleCountForL0Sublevel)
-					}
-				}
-				numSuccesses++
+		// Assert on l0 sublevel count and p99 latencies.
+		//
+		// TODO(aaditya): Add disk bandwidth assertion once
+		// https://github.com/cockroachdb/cockroach/pull/133310 lands.
+		latencyMetric := divQuery("histogram_quantile(0.99, sum by(le) (rate(sql_service_latency_bucket[2m])))", 1<<20 /* 1ms */)
+		const latencyThreshold = 100 // 100ms since the metric is scaled to 1ms above.
+		const sublevelMetric = "storage_l0_sublevels"
+		const sublevelThreshold = 20
+		var l0SublevelCount []float64
+		const sampleCountForL0Sublevel = 12
+		const collectionIntervalSeconds = 10.0
+		// Loop for ~120 minutes.
+		const numIterations = int(120 / (collectionIntervalSeconds / 60))
+		numErrors := 0
+		numSuccesses := 0
+		for i := 0; i < numIterations; i++ {
+			time.Sleep(collectionIntervalSeconds * time.Second)
+			val, err := getHistMetricVal(latencyMetric)
+			if err != nil {
+				numErrors++
+				continue
 			}
-			t.Status(fmt.Sprintf("done monitoring, errors: %d successes: %d", numErrors, numSuccesses))
-			if numErrors > numSuccesses {
-				t.Fatalf("too many errors retrieving metrics")
+			if val > latencyThreshold {
+				t.Fatalf("sql p99 latency %f exceeded threshold", val)
 			}
+			val, err = getMetricVal(sublevelMetric, "store")
+			if err != nil {
+				numErrors++
+				continue
+			}
+			l0SublevelCount = append(l0SublevelCount, val)
+			// We want to use the mean of the last 2m of data to avoid short-lived
+			// spikes causing failures.
+			if len(l0SublevelCount) >= sampleCountForL0Sublevel {
+				latestSampleMeanL0Sublevels := roachtestutil.GetMeanOverLastN(sampleCountForL0Sublevel, l0SublevelCount)
+				if latestSampleMeanL0Sublevels > sublevelThreshold {
+					t.Fatalf("sub-level mean %f over last %d iterations exceeded threshold", latestSampleMeanL0Sublevels, sampleCountForL0Sublevel)
+				}
+			}
+			numSuccesses++
+		}
+		t.Status(fmt.Sprintf("done monitoring, errors: %d successes: %d", numErrors, numSuccesses))
+		if numErrors > numSuccesses {
+			t.Fatalf("too many errors retrieving metrics")
 		}
 		return nil
 	})
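
A note on the rebalance-rate plumbing: the SET CLUSTER SETTING statement for
kv.snapshot_rebalance.max_rate was previously hard-coded to '256MiB'; it is now
built from the per-subtest rebalanceRate field, so the regular subtest keeps
the old 256MiB rate while the bandwidth-constrained subtest pushes a higher
1GiB rate against the throttled disk. A minimal, self-contained sketch of that
parameterization (the opts struct and subtest labels below are illustrative
stand-ins, not the test's actual registration code):

  package main

  import "fmt"

  // opts mirrors, in spirit, the per-subtest configuration; only the field
  // relevant to this sketch is included.
  type opts struct {
          label         string // hypothetical subtest label
          rebalanceRate string
  }

  func main() {
          for _, cfg := range []opts{
                  {label: "regular", rebalanceRate: "256MiB"},
                  {label: "bandwidth", rebalanceRate: "1GiB"},
          } {
                  // As in the patch, the rate is spliced into the statement
                  // with fmt.Sprintf rather than hard-coded.
                  stmt := fmt.Sprintf(
                          "SET CLUSTER SETTING kv.snapshot_rebalance.max_rate = '%s'",
                          cfg.rebalanceRate)
                  fmt.Printf("%s subtest: %s\n", cfg.label, stmt)
          }
  }

Since 256MiB matches the value that was previously hard-coded, the
non-bandwidth subtest's behavior is unchanged by this patch.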
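
A note on the assertion loop: the latency histogram is tracked in nanoseconds,
so the 1<<20 (~1.05e6) divisor passed to divQuery rescales the p99 to roughly
milliseconds, which is why latencyThreshold = 100 corresponds to ~100ms. The
L0 sublevel check averages the last sampleCountForL0Sublevel = 12 samples,
taken 10 seconds apart (~2 minutes of data), so a short-lived spike does not
fail the test. roachtestutil.GetMeanOverLastN is not shown in this patch;
assuming it returns the arithmetic mean of the last n samples, the guard
behaves as in this self-contained sketch (meanOverLastN is a hypothetical
stand-in for it):

  package main

  import "fmt"

  // meanOverLastN returns the arithmetic mean of the last n samples (or of
  // all samples, if fewer than n have been collected).
  func meanOverLastN(n int, samples []float64) float64 {
          if len(samples) == 0 {
                  return 0
          }
          if len(samples) < n {
                  n = len(samples)
          }
          var sum float64
          for _, v := range samples[len(samples)-n:] {
                  sum += v
          }
          return sum / float64(n)
  }

  func main() {
          const sampleCountForL0Sublevel = 12 // 12 samples x 10s = ~2m of data
          const sublevelThreshold = 20

          var l0SublevelCount []float64
          // A series with one short-lived spike (30) in otherwise low counts.
          for _, v := range []float64{5, 8, 30, 6, 7, 5, 4, 6, 9, 5, 7, 6} {
                  l0SublevelCount = append(l0SublevelCount, v)
                  // As in the test, only assert once a full window is available.
                  if len(l0SublevelCount) >= sampleCountForL0Sublevel {
                          m := meanOverLastN(sampleCountForL0Sublevel, l0SublevelCount)
                          if m > sublevelThreshold {
                                  fmt.Printf("would fail: mean %.2f > %d\n", m, sublevelThreshold)
                          } else {
                                  fmt.Printf("ok: mean %.2f <= %d\n", m, sublevelThreshold)
                          }
                  }
          }
  }

With this series, the single spike to 30 leaves the 12-sample mean at ~8.17,
well under the threshold of 20, so only sustained sublevel growth would trip
the assertion.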