Skip to content

Commit

Permalink
Merge #104885
Browse files Browse the repository at this point in the history
104885: roachtest: reduce rebalance-by-load noise r=kvoli a=kvoli

The rebalance-by-load tests assert that the normalized [0,1] CPU utilization of each node is within some threshold of the mean. The threshold was previously 10%; however, it is not unexpected for total node load to fall outside this threshold even when replica load is within it.

The current balancing implementation only concerns itself with replica load.

Bump the tolerance from 10% to 15% to reduce noise.

Additionally, the test did not wait for 3x replication prior to beginning the workload. This is bound to introduce flakes eventually. Wait for 3x replication before beginning.

Resolves: #104854
Resolves: #104386

Release note: None

Co-authored-by: Austen McClernon <[email protected]>
  • Loading branch information
craig[bot] and kvoli committed Jun 15, 2023
2 parents 13b9ad0 + 1fefe10 commit 8d9cbc4
Showing 1 changed file with 16 additions and 18 deletions.
34 changes: 16 additions & 18 deletions pkg/cmd/roachtest/tests/rebalance_load.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ const (
//
// mean_tolerance = mean * meanCPUTolerance
// [mean - mean_tolerance, mean + mean_tolerance].
meanCPUTolerance = 0.1
//
// The store rebalancer watches the replica CPU load and balances within
// +-10% of the mean. To reduce noise, add a buffer (5%) on top.
meanCPUTolerance = 0.15
// statSamplePeriod is the period at which timeseries stats are sampled.
statSamplePeriod = 10 * time.Second
)
Expand Down Expand Up @@ -105,6 +108,16 @@ func registerRebalanceLoad(r registry.Registry) {
c.Put(ctx, t.DeprecatedWorkload(), "./workload", appNode)
c.Run(ctx, appNode, fmt.Sprintf("./workload init kv --drop --splits=%d {pgurl:1}", splits))

db := c.Conn(ctx, t.L(), 1)
defer db.Close()

require.NoError(t, WaitFor3XReplication(ctx, t, db))
t.Status("disable load based splitting")
require.NoError(t, disableLoadBasedSplitting(ctx, db))
t.Status(fmt.Sprintf("setting rebalance mode to %s", rebalanceMode))
_, err := db.ExecContext(ctx, `SET CLUSTER SETTING kv.allocator.load_based_rebalancing=$1::string`, rebalanceMode)
require.NoError(t, err)

var m *errgroup.Group // see comment in version.go
m, ctx = errgroup.WithContext(ctx)

Expand All @@ -115,7 +128,6 @@ func registerRebalanceLoad(r registry.Registry) {

m.Go(func() error {
t.L().Printf("starting load generator\n")

err := c.RunE(ctx, appNode, fmt.Sprintf(
"./workload run kv --read-percent=95 --tolerate-errors --concurrency=%d "+
"--duration=%v {pgurl:1-%d}",
Expand All @@ -132,20 +144,6 @@ func registerRebalanceLoad(r registry.Registry) {
m.Go(func() error {
t.Status("checking for CPU balance")

db := c.Conn(ctx, t.L(), 1)
defer db.Close()

t.Status("disable load based splitting")
if err := disableLoadBasedSplitting(ctx, db); err != nil {
return err
}

if _, err := db.ExecContext(
ctx, `SET CLUSTER SETTING kv.allocator.load_based_rebalancing=$1::string`, rebalanceMode,
); err != nil {
return err
}

storeCPUFn, err := makeStoreCPUFn(ctx, c, t, numNodes, numStores)
if err != nil {
return err
Expand Down Expand Up @@ -196,7 +194,7 @@ func registerRebalanceLoad(r registry.Registry) {
concurrency = 32
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
}
rebalanceLoadRun(ctx, t, c, "leases", 3*time.Minute, concurrency, false /* mixedVersion */)
rebalanceLoadRun(ctx, t, c, "leases", 5*time.Minute, concurrency, false /* mixedVersion */)
},
},
)
Expand All @@ -210,7 +208,7 @@ func registerRebalanceLoad(r registry.Registry) {
concurrency = 32
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
}
rebalanceLoadRun(ctx, t, c, "leases", 3*time.Minute, concurrency, true /* mixedVersion */)
rebalanceLoadRun(ctx, t, c, "leases", 5*time.Minute, concurrency, true /* mixedVersion */)
},
},
)
Expand Down

0 comments on commit 8d9cbc4

Please sign in to comment.