From 87d8d741832e9b06554641fe659bfa184d5cef7d Mon Sep 17 00:00:00 2001 From: Alex Robinson Date: Tue, 7 Aug 2018 03:15:10 -0500 Subject: [PATCH] roachtest: Add test for load-based lease rebalancing It consistently passes with store-level load-based lease rebalancing, but fails more often than not without it. Release note: None --- pkg/cmd/roachtest/allocator.go | 2 +- pkg/cmd/roachtest/rebalance_load.go | 180 ++++++++++++++++++++++++++++ pkg/cmd/roachtest/registry.go | 1 + 3 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 pkg/cmd/roachtest/rebalance_load.go diff --git a/pkg/cmd/roachtest/allocator.go b/pkg/cmd/roachtest/allocator.go index 7fac79ade1cb..f30b4b8a55f3 100644 --- a/pkg/cmd/roachtest/allocator.go +++ b/pkg/cmd/roachtest/allocator.go @@ -33,7 +33,7 @@ func registerAllocator(r *registry) { c.Put(ctx, workload, "./workload") // Start the first `start` nodes and restore the fixture - args := startArgs("--args=--vmodule=allocator=5,allocator_scorer=5,replicate_queue=5") + args := startArgs("--args=--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5") c.Start(ctx, c.Range(1, start), args) db := c.Conn(ctx, 1) defer db.Close() diff --git a/pkg/cmd/roachtest/rebalance_load.go b/pkg/cmd/roachtest/rebalance_load.go new file mode 100644 index 000000000000..a2611e1e0e49 --- /dev/null +++ b/pkg/cmd/roachtest/rebalance_load.go @@ -0,0 +1,180 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. See the AUTHORS file +// for names of contributors. + +package main + +import ( + "context" + gosql "database/sql" + "fmt" + "io/ioutil" + "os" + "sort" + "strconv" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "golang.org/x/sync/errgroup" +) + +func registerRebalanceLoad(r *registry) { + // This test creates a single table for kv to use and splits the table to + // have one range for every node in the cluster. Because even brand new + // clusters start with 20+ ranges in them, the number of new ranges in kv's + // table is small enough that it typically won't trigger rebalancing of + // leases in the cluster based on lease count alone. We let kv generate a lot + // of load against the ranges such that when + // kv.allocator.stat_based_rebalancing.enabled is set to true, we'd expect + // load-based rebalancing to distribute the load evenly across the nodes in + // the cluster. Without that setting, the fact that the kv table has so few + // ranges means that they probablistically won't have their leases evenly + // spread across all the nodes (they'll often just end up staying on n1). + // + // In other words, this test should always pass with + // kv.allocator.stat_based_rebalancing.enabled set to true, while it should + // usually (but not always fail) with it set to false. + rebalanceLoadRun := func(ctx context.Context, t *test, c *cluster, duration time.Duration, concurrency int) { + roachNodes := c.Range(1, c.nodes-1) + appNode := c.Node(c.nodes) + + c.Put(ctx, cockroach, "./cockroach", roachNodes) + args := startArgs( + "--args=--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5") + c.Start(ctx, roachNodes, args) + + c.Put(ctx, workload, "./workload", appNode) + c.Run(ctx, appNode, `./workload init kv --drop {pgurl:1}`) + + var m *errgroup.Group // see comment in version.go + m, ctx = errgroup.WithContext(ctx) + + m.Go(func() error { + c.l.printf("starting load generator\n") + + quietL, err := newLogger("run kv", strconv.Itoa(0), "workload"+strconv.Itoa(0), ioutil.Discard, os.Stderr) + if err != nil { + return err + } + splits := len(roachNodes) - 1 // n-1 splits => n ranges => 1 lease per node + return c.RunL(ctx, quietL, appNode, fmt.Sprintf( + "./workload run kv --read-percent=95 --splits=%d --tolerate-errors --concurrency=%d "+ + "--duration=%s {pgurl:1-3}", + splits, concurrency, duration.String())) + }) + + m.Go(func() error { + t.Status(fmt.Sprintf("starting checks for lease balance")) + + db := c.Conn(ctx, 1) + defer db.Close() + + if _, err := db.ExecContext( + ctx, `SET CLUSTER SETTING kv.allocator.stat_based_rebalancing.enabled=true`, + ); err != nil { + return err + } + + for tBegin := timeutil.Now(); timeutil.Since(tBegin) <= duration; { + if done, err := isLoadEvenlyDistributed(c.l, db, len(roachNodes)); err != nil { + return err + } else if done { + c.l.printf("successfully achieved lease balance\n") + return nil + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(5 * time.Second): + } + } + + return fmt.Errorf("timed out before leases were evenly spread") + }) + if err := m.Wait(); err != nil { + t.Fatal(err) + } + } + + minutes := 2 * time.Minute + numNodes := 4 // the last node is just used to generate load + concurrency := 128 + + r.Add(testSpec{ + Name: `rebalance-leases-by-load`, + Nodes: nodes(numNodes), + Stable: false, // TODO(a-robinson): Promote to stable + Run: func(ctx context.Context, t *test, c *cluster) { + if local { + concurrency = 32 + fmt.Printf("lowering concurrency to %d in local testing\n", concurrency) + } + rebalanceLoadRun(ctx, t, c, minutes, concurrency) + }, + }) +} + +func isLoadEvenlyDistributed(l *logger, db *gosql.DB, numNodes int) (bool, error) { + rows, err := db.Query( + `select lease_holder, count(*) ` + + `from [show experimental_ranges from table kv.kv] ` + + `group by lease_holder;`) + if err != nil { + return false, err + } + defer rows.Close() + leaseCounts := make(map[int]int) + var rangeCount int + for rows.Next() { + var storeID, leaseCount int + if err := rows.Scan(&storeID, &leaseCount); err != nil { + return false, err + } + leaseCounts[storeID] = leaseCount + rangeCount += leaseCount + } + l.printf("numbers of test.kv leases on each store: %v\n", leaseCounts) + + if len(leaseCounts) < numNodes { + l.printf("not all nodes have a lease yet: %v\n", leaseCounts) + return false, nil + } + + // The simple case is when ranges haven't split. We can require that every + // store has one lease. + if rangeCount == numNodes { + for _, leaseCount := range leaseCounts { + if leaseCount != 1 { + l.printf("uneven lease distribution: %v\n", leaseCounts) + return false, nil + } + } + return true, nil + } + + // For completeness, if leases have split, verify the leases per store don't + // differ by any more than 1. + leases := make([]int, 0, numNodes) + for _, leaseCount := range leaseCounts { + leases = append(leases, leaseCount) + } + sort.Ints(leases) + if leases[0]+1 < leases[len(leases)-1] { + l.printf("leases per store differ by more than one: %v\n", leaseCounts) + return false, nil + } + + return true, nil +} diff --git a/pkg/cmd/roachtest/registry.go b/pkg/cmd/roachtest/registry.go index 5d9319b83cc2..e425b7bd0998 100644 --- a/pkg/cmd/roachtest/registry.go +++ b/pkg/cmd/roachtest/registry.go @@ -45,6 +45,7 @@ func registerTests(r *registry) { registerKVSplits(r) registerLargeRange(r) registerQueue(r) + registerRebalanceLoad(r) registerRestore(r) registerRoachmart(r) registerScaleData(r)