Skip to content

Commit

Permalink
roachtest: add admission/follower-overload
Browse files Browse the repository at this point in the history
This is a less ad-hoc version of the experiment in cockroachdb#81289, where I
messed with the EBS configuration. This can't be done programmatically,
and so here we use an IO nemesis on n3 instead.

Release note: None
  • Loading branch information
tbg committed Jun 23, 2022
1 parent 1aa57fa commit 4b9497d
Show file tree
Hide file tree
Showing 3 changed files with 283 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ go_library(
"acceptance.go",
"activerecord.go",
"activerecord_blocklist.go",
"admission_control_follower_overload.go",
"allocator.go",
"alterpk.go",
"asyncpg.go",
Expand Down
281 changes: 281 additions & 0 deletions pkg/cmd/roachtest/tests/admission_control_follower_overload.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
//

package tests

import (
"context"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/prometheus"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/stretchr/testify/require"
)

// registerAdmissionControlFollowerOverload adds the
// admission/follower-overload/* roachtests to the registry, one per entry in
// the variants table below.
func registerAdmissionControlFollowerOverload(r registry.Registry) {
	// makeSpec assembles the TestSpec shared by every variant; only the name
	// suffix and the options handed to the run function differ.
	makeSpec := func(variant string, opts admissionControlFollowerOverloadOpts) registry.TestSpec {
		var ts registry.TestSpec
		ts.Name = "admission/follower-overload/" + variant
		ts.Owner = registry.OwnerKV
		ts.Timeout = 3 * time.Hour
		// The cluster is never re-used: the test deploys a disk nemesis that is
		// not guaranteed to be shut down when the test terminates.
		ts.Cluster = r.MakeClusterSpec(4, spec.CPU(4), spec.ReuseNone(), spec.AvoidSSD(), spec.SSD(0), spec.MultipleStores(true))
		ts.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
			runAdmissionControlFollowerOverload(ctx, t, c, opts)
		}
		return ts
	}

	variants := []struct {
		name string
		opts admissionControlFollowerOverloadOpts
	}{
		{
			// The control group: plain cluster workloads without a nemesis.
			// Running it and looking at performance blips gives an idea of what
			// "normal" looks like. It is most directly contrasted with
			// presplit-with-leases, but since the workload on n3 barely needs
			// any resources it should also compare well with
			// presplit-no-leases.
			name: "presplit-control",
			opts: admissionControlFollowerOverloadOpts{
				kv0N12:         true,
				kvN12ExtraArgs: "--splits 100",
				kv50N3:         true,
			},
		},
		{
			// n3 holds no leases (but has disk overload), so n1 and n2 field all
			// of the active work and replicate to n3. The workload should be
			// steady with good p99s since there is no backpressure from n3 (at
			// the time of writing) and no foreground traffic is sent to it. The
			// quota pools shouldn't deplete since writes are spread out evenly
			// across 100 ranges.
			name: "presplit-no-leases",
			opts: admissionControlFollowerOverloadOpts{
				ioNemesis:      true,
				kv0N12:         true,
				kvN12ExtraArgs: "--splits 100",
			},
		},
		{
			// Same as before, except that the writes are no longer spread out
			// and instead all hit the same range. The quota pool on that range
			// could run significantly emptier, possibly to the point of stalling
			// foreground writes.
			//
			// NOTE(review): kv50N3 is set here even though the name says
			// "no-leases" and presplit-no-leases leaves it unset - confirm this
			// is intended.
			name: "hotspot-no-leases",
			opts: admissionControlFollowerOverloadOpts{
				ioNemesis:      true,
				kv0N12:         true,
				kvN12ExtraArgs: "--sequential",
				kv50N3:         true,
			},
		},
		{
			// Identical to presplit-no-leases, but additionally runs a (small)
			// workload against n3. The performance of that workload shows the
			// impact of follower write overload on a foreground workload.
			name: "presplit-with-leases",
			opts: admissionControlFollowerOverloadOpts{
				ioNemesis:      true,
				kv0N12:         true,
				kvN12ExtraArgs: "--splits=100",
				kv50N3:         true,
			},
		},
	}
	for _, v := range variants {
		r.Add(makeSpec(v.name, v.opts))
	}
}

// admissionControlFollowerOverloadOpts selects which workloads and which
// nemesis runAdmissionControlFollowerOverload deploys.
type admissionControlFollowerOverloadOpts struct {
// ioNemesis, if set, deploys a write-only fio job on n3 (as the systemd unit
// fio-n3) after the first phase of the test.
ioNemesis bool
// kv0N12, if set, initializes the default kv workload and runs it with
// --read-percent 0 against n1 and n2 (as the systemd unit kv-n12).
kv0N12 bool
// kvN12ExtraArgs holds whitespace-separated extra arguments appended to the
// `workload init kv` invocation made when kv0N12 is set.
// NOTE(review): these are only passed to `workload init`, not to
// `workload run` - confirm that is sufficient for flags like --sequential.
kvN12ExtraArgs string
// kv50N3, if set, initializes a kv workload in database kvn3, sets a lease
// preference for node3 on that database, and runs it with --read-percent 50
// against n3 (as the systemd unit kv-n3).
kv50N3 bool
}

// runAdmissionControlFollowerOverload runs one follower-overload experiment on
// a non-local four-node cluster: n1-n3 run cockroach, n4 hosts the workloads
// and prometheus.
//
// The steps are:
//  1. set up prometheus on n4 and install fio on n3;
//  2. start cockroach on n1-n3 and initialize the workloads selected by cfg;
//  3. configure lease preferences so that n3 holds no leases except (if
//     cfg.kv50N3 is set) those of the kvn3 database, and wait for the leases
//     to move;
//  4. deploy the selected workloads as systemd units on n4 and wait for one
//     phase duration;
//  5. if cfg.ioNemesis is set, deploy fio on n3, then wait for another phase
//     duration.
//
// The workloads and fio run as systemd units and are not stopped by this
// function.
func runAdmissionControlFollowerOverload(
	ctx context.Context, t test.Test, c cluster.Cluster, cfg admissionControlFollowerOverloadOpts,
) {
	require.False(t, c.IsLocal())

	resetSystemdUnits := func() {
		for _, cmd := range []string{"stop", "reset-failed"} {
			// Best effort: errors are ignored on purpose, since the units do not
			// exist unless a previous run on this cluster created them.
			_ = c.RunE(ctx, c.Node(3), "sudo", "systemctl", cmd, "fio-n3")
			_ = c.RunE(ctx, c.Node(4), "sudo", "systemctl", cmd, "kv-n12")
			_ = c.RunE(ctx, c.Node(4), "sudo", "systemctl", cmd, "kv-n3")
		}
	}

	// Make cluster re-use possible to iterate on this test without making a new
	// cluster every time.
	// NOTE(review): this development switch is still enabled - confirm it should
	// stay on before relying on this test outside of manual iteration.
	const dev = true
	if dev {
		resetSystemdUnits()
	}

	// Set up prometheus.
	{
		clusNodes := c.Range(1, c.Spec().NodeCount-1)
		workloadNode := c.Node(c.Spec().NodeCount)
		promNode := workloadNode
		// Named promCfg so that it does not shadow the test options in cfg.
		promCfg := (&prometheus.Config{}).
			WithPrometheusNode(promNode).
			WithGrafanaDashboard("https://gist.githubusercontent.com/tbg/f238d578269143187e71a1046562225f/raw").
			WithCluster(clusNodes).
			WithNodeExporter(clusNodes).
			WithWorkload(workloadNode, 2112). // kv-n12
			WithWorkload(workloadNode, 2113)  // kv-n3 (if present)

		_, saveSnap, err := prometheus.Init(
			ctx,
			*promCfg,
			c,
			t.L(),
			repeatRunner{C: c, T: t}.repeatRunE,
		)
		require.NoError(t, err)
		// Deferred calls run at function exit, so the snapshot is saved when the
		// whole test body returns, not at the end of this block.
		defer saveSnap(t.ArtifactsDir())
	}

	phaseDuration := 3 * time.Minute // TODO time.Hour
	require.NoError(t, repeatRunE(ctx, t, c, c.Node(3), "download fio",
		"sudo", "apt", "install", "-qyy", "fio", "iotop"))

	nodes := c.Range(1, 3)
	c.Put(ctx, t.Cockroach(), "cockroach")
	c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), nodes)
	db := c.Conn(ctx, t.L(), 1)
	// Release the connection when the test body returns.
	defer db.Close()
	require.NoError(t, WaitFor3XReplication(ctx, t, db))

	if cfg.kv0N12 {
		args := strings.Fields("./cockroach workload init kv {pgurl:1}")
		args = append(args, strings.Fields(cfg.kvN12ExtraArgs)...)
		c.Run(ctx, c.Node(1), args...)
	}
	if cfg.kv50N3 {
		args := strings.Fields("./cockroach workload init kv --db kvn3 {pgurl:1}")
		c.Run(ctx, c.Node(1), args...)
	}

	// Node 3 should not have any leases (excepting kvn3, if present).
	runner := sqlutils.MakeSQLRunner(db)
	for _, row := range runner.QueryStr(
		t, `SELECT target FROM [ SHOW ZONE CONFIGURATIONS ]`,
	) {
		// The target comes straight from SHOW ZONE CONFIGURATIONS on the cluster
		// under test, so it is spliced into the statement as is.
		q := `ALTER ` + row[0] + ` CONFIGURE ZONE USING lease_preferences = '[[-node3]]'`
		t.L().Printf("%s", q)
		runner.Exec(t, q)
	}
	if cfg.kv50N3 {
		q := `ALTER DATABASE kvn3 CONFIGURE ZONE USING lease_preferences = '[[+node3]]', constraints = COPY FROM PARENT`
		t.L().Printf("%s", q)
		runner.Exec(t, q)
	}

	// Poll until the leases are where the zone configs above want them: none on
	// n3 outside of kvn3, and all kvn3 leases on n3.
	{
		var attempts int
		for ctx.Err() == nil {
			attempts++
			m1 := runner.QueryStr(t, `SELECT range_id FROM crdb_internal.ranges WHERE lease_holder=3 AND database_name != 'kvn3'`)
			m2 := runner.QueryStr(t, `SELECT range_id FROM crdb_internal.ranges WHERE lease_holder!=3 AND database_name = 'kvn3'`)
			if len(m1)+len(m2) == 0 {
				t.L().Printf("done waiting for lease movement")
				break
			}
			if len(m1) > 0 {
				t.L().Printf("waiting for %d range leases to move off n3: %v", len(m1), m1)
			}
			if len(m2) > 0 {
				t.L().Printf("waiting for %d range leases to move to n3: %v", len(m2), m2)
			}

			time.Sleep(10 * time.Second)
			require.Less(t, attempts, 100)
		}
		// The loop also ends when the context is canceled; fail here rather than
		// deploying workloads with leases in an unknown state.
		require.NoError(t, ctx.Err())
	}

	if cfg.kv0N12 {
		// Deploy workload against the default kv database (which has no leases on
		// n3) and let it run for a phase duration. This does not block and keeps
		// running even after the test tears down. Initially, the below workload was
		// configured for 400 requests per second with 10k blocks, amounting to
		// 4mb/s of goodput. Experimentally this was observed to cause (after ~8h) a
		// per-store read throughput of ~60mb/s and write throughput of ~140mb/s for
		// a total of close to 200mb/s (per store). This was too much for default
		// EBS disks (see below) and there was unpredictable performance when
		// reprovisioning such volumes with higher throughput, so we run at 2mb/s
		// which should translate to ~100mb/s of max sustained combined throughput.
		//
		// NB: on GCE pd-ssd, we get 30 IOPS/GB of (combined) throughput and
		// 0.45MB/(GB*s) for each GB provisioned, so for the 500GB volumes in this
		// test 15k IOPS and 225MB/s.
		//
		// See: https://cloud.google.com/compute/docs/disks/performance#footnote-1
		//
		// On AWS, the default EBS volumes have 3000 IOPS and 125MB/s combined
		// throughput.
		//
		// The command is split on whitespace below, so the line breaks in the
		// literal are not significant.
		const deployWorkload = `
mkdir -p logs &&
sudo systemd-run --property=Type=exec
--property=StandardOutput=file:/home/ubuntu/logs/kv-n12.stdout.log
--property=StandardError=file:/home/ubuntu/logs/kv-n12.stderr.log
--remain-after-exit --unit kv-n12 -- ./cockroach workload run kv --read-percent 0
--max-rate 400 --concurrency 1000 --min-block-bytes 5000 --max-block-bytes 5000 --tolerate-errors {pgurl:1-2}`
		c.Run(ctx, c.Node(4), strings.Fields(deployWorkload)...)
	}
	if cfg.kv50N3 {
		// On n3, we run a "trickle" workload that does not add much work to the
		// system but which we can use to monitor the impact of the overload on
		// the follower on its foreground traffic. All leases for this workload
		// are held by n3.
		//
		// The logs directory is created here as well so that this unit does not
		// depend on the kv-n12 deployment above having run first.
		const deployWorkload = `
mkdir -p logs &&
sudo systemd-run --property=Type=exec
--property=StandardOutput=file:/home/ubuntu/logs/kv-n3.stdout.log
--property=StandardError=file:/home/ubuntu/logs/kv-n3.stderr.log
--remain-after-exit --unit kv-n3 -- ./cockroach workload run kv --db kvn3
--read-percent 50 --max-rate 100 --concurrency 1000 --min-block-bytes 100 --max-block-bytes 100
--prometheus-port 2113 --tolerate-errors {pgurl:3}`
		c.Run(ctx, c.Node(4), strings.Fields(deployWorkload)...)
	}
	t.L().Printf("deployed workload")

	// Phase one: workloads only.
	wait(c.NewMonitor(ctx, nodes), phaseDuration)

	if cfg.ioNemesis {
		// Add 1000MiB/s of "badput", i.e. IO nemesis. These are pure writes. This
		// doesn't block and will keep running even after the test tears down.
		// 1000MiB/s should be enough to overwhelm any provisioned storage.
		const deployFIO = `
sudo systemd-run --property=Type=exec --remain-after-exit --unit fio-n3 -- fio --rw=write --name=test --direct=1 --bs=1024k --size 1mb
--ioengine=libaio --iodepth=4 --directory={store-dir}/auxiliary --rate=0,1000m --time_based --timeout 2400h
`
		c.Run(ctx, c.Node(3), strings.Fields(deployFIO)...)
		t.L().Printf("deployed fio")
	}

	// Phase two: workloads plus (if enabled) the IO nemesis.
	wait(c.NewMonitor(ctx, nodes), phaseDuration)

	// TODO collect, assert on, and export metrics, using:
	// https://github.com/cockroachdb/cockroach/pull/80724.
	// Things to check:
	// - LSM health of follower (and, to be sure, on other replicas)
	// - Latency of a benign read-only workload on the follower
	// - Comparison of baseline perf of kv0 workload before disk nemesis (i.e.
	//   run first without nemesis, then with nemesis, maybe again without, make
	//   sure they're all sort of comparable, or report all three, or something
	//   like that). At first probably just export the overall coefficient of
	//   variation or something like that and leave detailed interpretation to
	//   human eyes on roachperf.
	//
	// Until the assertions above exist, the test fails unconditionally.
	t.Fatal("failing on purpose")
}

// wait registers a task on m that finishes after duration has elapsed (or
// returns the context's error if the task's context is done first) and then
// calls m.Wait.
func wait(m cluster.Monitor, duration time.Duration) {
	sleep := func(ctx context.Context) error {
		timer := time.NewTimer(duration)
		defer timer.Stop()
		select {
		case <-timer.C:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	m.Go(sleep)
	m.Wait()
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
func RegisterTests(r registry.Registry) {
registerAcceptance(r)
registerActiveRecord(r)
registerAdmissionControlFollowerOverload(r)
registerAllocator(r)
registerAlterPK(r)
registerAWSDMS(r)
Expand Down

0 comments on commit 4b9497d

Please sign in to comment.