Skip to content

Commit

Permalink
roachtest: snapshot ingest roachtest improvements
Browse files Browse the repository at this point in the history
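This patch contains some small improvements to better test the bandwidth
subtest of the snapshot ingest roachtest: the snapshot rebalance rate is now
configurable per subtest (256MiB without a disk bandwidth limit, 1GiB for the
bandwidth subtest), and the L0 sublevel and p99 latency assertions now run for
the bandwidth subtest as well.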

Informs cockroachdb#86857

Release note: None
  • Loading branch information
aadityasondhi committed Nov 26, 2024
1 parent aded21f commit d3a4bcc
Showing 1 changed file with 48 additions and 45 deletions.
93 changes: 48 additions & 45 deletions pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ func registerSnapshotOverloadIO(r registry.Registry) {
limitDiskBandwidth: false,
readPercent: 75,
workloadBlockBytes: 12288,
rebalanceRate: "256MiB",
}))

// This tests the behaviour of snpashot ingestion in bandwidth constrained
// This tests the behaviour of snapshot ingestion in bandwidth constrained
// environments.
r.Add(spec("bandwidth", admissionControlSnapshotOverloadIOOpts{
// 2x headroom from the ~500GB pre-population of the test.
Expand All @@ -81,6 +82,7 @@ func registerSnapshotOverloadIO(r registry.Registry) {
limitDiskBandwidth: true,
readPercent: 20,
workloadBlockBytes: 1024,
rebalanceRate: "1GiB",
}))

}
Expand All @@ -91,6 +93,7 @@ type admissionControlSnapshotOverloadIOOpts struct {
limitDiskBandwidth bool
readPercent int
workloadBlockBytes int
rebalanceRate string
}

func runAdmissionControlSnapshotOverloadIO(
Expand Down Expand Up @@ -138,9 +141,9 @@ func runAdmissionControlSnapshotOverloadIO(
t.Fatalf("failed to set storage.ingest_split.enabled: %v", err)
}

// Set a high rebalance rate.
// Set rebalance rate.
if _, err := db.ExecContext(
ctx, "SET CLUSTER SETTING kv.snapshot_rebalance.max_rate = '256MiB'"); err != nil {
ctx, fmt.Sprintf("SET CLUSTER SETTING kv.snapshot_rebalance.max_rate = '%s'", cfg.rebalanceRate)); err != nil {
t.Fatalf("failed to set kv.snapshot_rebalance.max_rate: %v", err)
}
}
Expand Down Expand Up @@ -262,50 +265,50 @@ func runAdmissionControlSnapshotOverloadIO(
return float64(fromVec[0].Value), nil
}

// TODO(aaditya): assert on disk bandwidth subtest once integrated.
if !cfg.limitDiskBandwidth {
// Assert on l0 sublevel count and p99 latencies.
latencyMetric := divQuery("histogram_quantile(0.99, sum by(le) (rate(sql_service_latency_bucket[2m])))", 1<<20 /* 1ms */)
const latencyThreshold = 100 // 100ms since the metric is scaled to 1ms above.
const sublevelMetric = "storage_l0_sublevels"
const sublevelThreshold = 20
var l0SublevelCount []float64
const sampleCountForL0Sublevel = 12
const collectionIntervalSeconds = 10.0
// Loop for ~120 minutes.
const numIterations = int(120 / (collectionIntervalSeconds / 60))
numErrors := 0
numSuccesses := 0
for i := 0; i < numIterations; i++ {
time.Sleep(collectionIntervalSeconds * time.Second)
val, err := getHistMetricVal(latencyMetric)
if err != nil {
numErrors++
continue
}
if val > latencyThreshold {
t.Fatalf("sql p99 latency %f exceeded threshold", val)
}
val, err = getMetricVal(sublevelMetric, "store")
if err != nil {
numErrors++
continue
}
l0SublevelCount = append(l0SublevelCount, val)
// We want to use the mean of the last 2m of data to avoid short-lived
// spikes causing failures.
if len(l0SublevelCount) >= sampleCountForL0Sublevel {
latestSampleMeanL0Sublevels := roachtestutil.GetMeanOverLastN(sampleCountForL0Sublevel, l0SublevelCount)
if latestSampleMeanL0Sublevels > sublevelThreshold {
t.Fatalf("sub-level mean %f over last %d iterations exceeded threshold", latestSampleMeanL0Sublevels, sampleCountForL0Sublevel)
}
}
numSuccesses++
// Assert on l0 sublevel count and p99 latencies.
//
// TODO(aaditya): Add disk bandwidth assertion once
// https://github.com/cockroachdb/cockroach/pull/133310 lands.
latencyMetric := divQuery("histogram_quantile(0.99, sum by(le) (rate(sql_service_latency_bucket[2m])))", 1<<20 /* 1ms */)
const latencyThreshold = 100 // 100ms since the metric is scaled to 1ms above.
const sublevelMetric = "storage_l0_sublevels"
const sublevelThreshold = 20
var l0SublevelCount []float64
const sampleCountForL0Sublevel = 12
const collectionIntervalSeconds = 10.0
// Loop for ~120 minutes.
const numIterations = int(120 / (collectionIntervalSeconds / 60))
numErrors := 0
numSuccesses := 0
for i := 0; i < numIterations; i++ {
time.Sleep(collectionIntervalSeconds * time.Second)
val, err := getHistMetricVal(latencyMetric)
if err != nil {
numErrors++
continue
}
t.Status(fmt.Sprintf("done monitoring, errors: %d successes: %d", numErrors, numSuccesses))
if numErrors > numSuccesses {
t.Fatalf("too many errors retrieving metrics")
if val > latencyThreshold {
t.Fatalf("sql p99 latency %f exceeded threshold", val)
}
val, err = getMetricVal(sublevelMetric, "store")
if err != nil {
numErrors++
continue
}
l0SublevelCount = append(l0SublevelCount, val)
// We want to use the mean of the last 2m of data to avoid short-lived
// spikes causing failures.
if len(l0SublevelCount) >= sampleCountForL0Sublevel {
latestSampleMeanL0Sublevels := roachtestutil.GetMeanOverLastN(sampleCountForL0Sublevel, l0SublevelCount)
if latestSampleMeanL0Sublevels > sublevelThreshold {
t.Fatalf("sub-level mean %f over last %d iterations exceeded threshold", latestSampleMeanL0Sublevels, sampleCountForL0Sublevel)
}
}
numSuccesses++
}
t.Status(fmt.Sprintf("done monitoring, errors: %d successes: %d", numErrors, numSuccesses))
if numErrors > numSuccesses {
t.Fatalf("too many errors retrieving metrics")
}
return nil
})
Expand Down

0 comments on commit d3a4bcc

Please sign in to comment.