Skip to content

Commit

Permalink
roachtest: reduce rebalance-by-load noise
Browse files Browse the repository at this point in the history
The rebalance-by-load tests assert that the normalized [0,1] CPU
utilization of each node is within some threshold of the mean. The
threshold was previously 10%; however, it is not unexpected for total
node load to fall outside this threshold even when replica load is
within it.

The current balancing implementation only concerns itself with replica
load.

Bump the tolerance from 10% to 15% to reduce noise.

Additionally, the test did not wait for 3x replication prior to
beginning the workload. This is bound to introduce flakes eventually.
Wait for 3x replication before beginning.

Resolves: #104854
Resolves: #104386

Release note: None
  • Loading branch information
kvoli committed Jun 27, 2023
1 parent 8661eed commit d7b46c1
Showing 1 changed file with 16 additions and 18 deletions.
34 changes: 16 additions & 18 deletions pkg/cmd/roachtest/tests/rebalance_load.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ const (
//
// mean_tolerance = mean * meanCPUTolerance
// [mean - mean_tolerance, mean + mean_tolerance].
meanCPUTolerance = 0.1
//
// The store rebalancer watches the replica CPU load and balances within
// +-10% of the mean. To reduce noise, add a buffer (5%) on top.
meanCPUTolerance = 0.15
// statSamplePeriod is the period at which timeseries stats are sampled.
statSamplePeriod = 10 * time.Second
)
Expand Down Expand Up @@ -103,6 +106,16 @@ func registerRebalanceLoad(r registry.Registry) {
c.Put(ctx, t.DeprecatedWorkload(), "./workload", appNode)
c.Run(ctx, appNode, fmt.Sprintf("./workload init kv --drop --splits=%d {pgurl:1}", splits))

db := c.Conn(ctx, t.L(), 1)
defer db.Close()

require.NoError(t, WaitFor3XReplication(ctx, t, db))
t.Status("disable load based splitting")
require.NoError(t, disableLoadBasedSplitting(ctx, db))
t.Status(fmt.Sprintf("setting rebalance mode to %s", rebalanceMode))
_, err := db.ExecContext(ctx, `SET CLUSTER SETTING kv.allocator.load_based_rebalancing=$1::string`, rebalanceMode)
require.NoError(t, err)

var m *errgroup.Group // see comment in version.go
m, ctx = errgroup.WithContext(ctx)

Expand All @@ -113,7 +126,6 @@ func registerRebalanceLoad(r registry.Registry) {

m.Go(func() error {
t.L().Printf("starting load generator\n")

err := c.RunE(ctx, appNode, fmt.Sprintf(
"./workload run kv --read-percent=95 --tolerate-errors --concurrency=%d "+
"--duration=%v {pgurl:1-%d}",
Expand All @@ -130,20 +142,6 @@ func registerRebalanceLoad(r registry.Registry) {
m.Go(func() error {
t.Status("checking for CPU balance")

db := c.Conn(ctx, t.L(), 1)
defer db.Close()

t.Status("disable load based splitting")
if err := disableLoadBasedSplitting(ctx, db); err != nil {
return err
}

if _, err := db.ExecContext(
ctx, `SET CLUSTER SETTING kv.allocator.load_based_rebalancing=$1::string`, rebalanceMode,
); err != nil {
return err
}

storeCPUFn, err := makeStoreCPUFn(ctx, c, t, numNodes, numStores)
if err != nil {
return err
Expand Down Expand Up @@ -193,7 +191,7 @@ func registerRebalanceLoad(r registry.Registry) {
concurrency = 32
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
}
rebalanceLoadRun(ctx, t, c, "leases", 3*time.Minute, concurrency, false /* mixedVersion */)
rebalanceLoadRun(ctx, t, c, "leases", 5*time.Minute, concurrency, false /* mixedVersion */)
},
},
)
Expand All @@ -207,7 +205,7 @@ func registerRebalanceLoad(r registry.Registry) {
concurrency = 32
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
}
rebalanceLoadRun(ctx, t, c, "leases", 3*time.Minute, concurrency, true /* mixedVersion */)
rebalanceLoadRun(ctx, t, c, "leases", 5*time.Minute, concurrency, true /* mixedVersion */)
},
},
)
Expand Down

0 comments on commit d7b46c1

Please sign in to comment.