80230: roachtest: remove network/* family of toxiproxy tests r=srosenberg a=tbg

These were originally introduced in cockroachdb#33282 in response to a resource leak
(goroutine buildup) under prolonged network partitions. However, this type of
testing never caught on, and the meaningful part of these tests has been skipped
for years at this point, with little hope for a comeback.

It's possible that we'll consider toxiproxy again in the future, and I'm not
against that in general. If and when we do, the git history will still contain
bits and pieces that we can salvage.

Release note: None


Co-authored-by: Tobias Grieger <[email protected]>
craig[bot] and tbg committed Apr 25, 2022
2 parents 8fe42c1 + 4136114 commit a1669f1
Showing 5 changed files with 1 addition and 539 deletions.
1 change: 0 additions & 1 deletion go.mod
@@ -16,7 +16,6 @@ require (
	github.com/MichaelTJones/walk v0.0.0-20161122175330-4748e29d5718
	github.com/PuerkitoBio/goquery v1.5.1
	github.com/Shopify/sarama v1.29.0
	github.com/Shopify/toxiproxy v2.1.4+incompatible
	github.com/VividCortex/ewma v1.1.1
	github.com/abourget/teamcity v0.0.0-00010101000000-000000000000
	github.com/alessio/shellescape v1.4.1
2 changes: 0 additions & 2 deletions pkg/cmd/roachtest/tests/BUILD.bazel
@@ -119,7 +119,6 @@ go_library(
"synctest.go",
"sysbench.go",
"tlp.go",
"toxiproxy.go",
"tpc_utils.go",
"tpcc.go",
"tpcdsvec.go",
@@ -213,7 +212,6 @@ go_library(
"@com_github_prometheus_client_golang//api/prometheus/v1:prometheus",
"@com_github_prometheus_common//model",
"@com_github_shopify_sarama//:sarama",
"@com_github_shopify_toxiproxy//client",
"@com_github_stretchr_testify//require",
"@org_golang_google_protobuf//proto",
"@org_golang_x_exp//rand",
256 changes: 0 additions & 256 deletions pkg/cmd/roachtest/tests/network.go
@@ -11,103 +11,25 @@
package tests

import (
"bytes"
"context"
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"sync/atomic"
"time"

toxiproxy "github.com/Shopify/toxiproxy/client"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
_ "github.com/lib/pq" // register postgres driver
"github.com/stretchr/testify/require"
)

// runNetworkSanity is just a sanity check to make sure we're setting up toxiproxy
// correctly. It injects latency between the nodes and verifies that we're not
// seeing the latency on the client connection running `SELECT 1` on each node.
func runNetworkSanity(ctx context.Context, t test.Test, origC cluster.Cluster, nodes int) {
	origC.Put(ctx, t.Cockroach(), "./cockroach", origC.All())
	c, err := Toxify(ctx, t, origC, origC.All())
	if err != nil {
		t.Fatal(err)
	}

	c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.All())

	db := c.Conn(ctx, t.L(), 1) // unaffected by toxiproxy
	defer db.Close()
	err = WaitFor3XReplication(ctx, t, db)
	require.NoError(t, err)

	// NB: we're generous with latency in this test because we're checking that
	// the upstream connections aren't affected by latency below, but the fixed
	// cost of starting the binary and processing the query is already close to
	// 100ms.
	const latency = 300 * time.Millisecond
	for i := 1; i <= nodes; i++ {
		// NB: note that these latencies only apply to connections *to* the node
		// on which the toxic is active. That is, if n1 has a (down or upstream)
		// latency toxic of 100ms, then none of its outbound connections are
		// affected but any connections made to it by other nodes will.
		// In particular, it's difficult to simulate intricate network partitions
		// as there's no way to activate toxics only for certain peers.
		proxy := c.Proxy(i)
		if _, err := proxy.AddToxic("", "latency", "downstream", 1, toxiproxy.Attributes{
			"latency": latency / (2 * time.Millisecond), // ms
		}); err != nil {
			t.Fatal(err)
		}
		if _, err := proxy.AddToxic("", "latency", "upstream", 1, toxiproxy.Attributes{
			"latency": latency / (2 * time.Millisecond), // ms
		}); err != nil {
			t.Fatal(err)
		}
	}

	m := c.Cluster.NewMonitor(ctx, c.All())
	m.Go(func(ctx context.Context) error {
		c.Measure(ctx, 1, `SET CLUSTER SETTING trace.debug.enable = true`)
		c.Measure(ctx, 1, "CREATE DATABASE test")
		c.Measure(ctx, 1, `CREATE TABLE test.commit (a INT, b INT, v INT, PRIMARY KEY (a, b))`)

		for i := 0; i < 10; i++ {
			duration := c.Measure(ctx, 1, fmt.Sprintf(
				"BEGIN; INSERT INTO test.commit VALUES (2, %[1]d), (1, %[1]d), (3, %[1]d); COMMIT",
				i,
			))
			t.L().Printf("%s\n", duration)
		}

		c.Measure(ctx, 1, `
set tracing=on;
insert into test.commit values(3,1000), (1,1000), (2,1000);
select age, message from [ show trace for session ];
`)

		for i := 1; i <= origC.Spec().NodeCount; i++ {
			if dur := c.Measure(ctx, i, `SELECT 1`); dur > latency {
				t.Fatalf("node %d unexpectedly affected by latency: select 1 took %.2fs", i, dur.Seconds())
			}
		}

		return nil
	})

	m.Wait()
}

// runNetworkAuthentication creates a network black hole to the leaseholder
// of system.users, and then validates that the time required to create
// new connections to the cluster afterwards remains under a reasonable limit.
@@ -367,156 +289,8 @@ sudo iptables-save
	m.Wait()
}

func runNetworkTPCC(ctx context.Context, t test.Test, origC cluster.Cluster, nodes int) {
	n := origC.Spec().NodeCount
	serverNodes, workerNode := origC.Range(1, n-1), origC.Node(n)
	origC.Put(ctx, t.Cockroach(), "./cockroach", origC.All())
	origC.Put(ctx, t.DeprecatedWorkload(), "./workload", origC.All())

	c, err := Toxify(ctx, t, origC, serverNodes)
	if err != nil {
		t.Fatal(err)
	}

	const warehouses = 1
	c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), serverNodes)
	c.Run(ctx, c.Node(1), tpccImportCmd(warehouses))

	db := c.Conn(ctx, t.L(), 1)
	defer db.Close()
	err = WaitFor3XReplication(ctx, t, db)
	require.NoError(t, err)

	duration := time.Hour
	if c.IsLocal() {
		// NB: this is really just testing the test with this duration, it won't
		// be able to detect slow goroutine leaks.
		duration = 5 * time.Minute
	}

	// Run TPCC, but don't give it the first node (or it basically won't do anything).
	m := c.NewMonitor(ctx, serverNodes)

	m.Go(func(ctx context.Context) error {
		t.WorkerStatus("running tpcc")

		cmd := fmt.Sprintf(
			"./workload run tpcc --warehouses=%d --wait=false"+
				" --histograms="+t.PerfArtifactsDir()+"/stats.json"+
				" --duration=%s {pgurl:2-%d}",
			warehouses, duration, c.Spec().NodeCount-1)
		return c.RunE(ctx, workerNode, cmd)
	})

	checkGoroutines := func(ctx context.Context) int {
		// NB: at the time of writing, the goroutine count would quickly
		// stabilize near 230 when the network is partitioned, and around 270
		// when it isn't. Experimentally a past "slow" goroutine leak leaked ~3
		// goroutines every minute (though it would likely be more with the tpcc
		// workload above), which over the duration of an hour would easily push
		// us over the threshold.
		const thresh = 350

		uiAddrs, err := c.ExternalAdminUIAddr(ctx, t.L(), serverNodes)
		if err != nil {
			t.Fatal(err)
		}
		var maxSeen int
		// The goroutine dump may take a while to generate, maybe more
		// than the 3 second timeout of the default http client.
		httpClient := httputil.NewClientWithTimeout(15 * time.Second)
		for _, addr := range uiAddrs {
			url := "http://" + addr + "/debug/pprof/goroutine?debug=2"
			resp, err := httpClient.Get(ctx, url)
			if err != nil {
				t.Fatal(err)
			}
			content, err := ioutil.ReadAll(resp.Body)
			resp.Body.Close()
			if err != nil {
				t.Fatal(err)
			}
			numGoroutines := bytes.Count(content, []byte("goroutine "))
			if numGoroutines >= thresh {
				t.Fatalf("%s shows %d goroutines (expected <%d)", url, numGoroutines, thresh)
			}
			if maxSeen < numGoroutines {
				maxSeen = numGoroutines
			}
		}
		return maxSeen
	}

	m.Go(func(ctx context.Context) error {
		time.Sleep(10 * time.Second) // give tpcc a head start
		// Give n1 a network partition from the remainder of the cluster. Note that even though it affects
		// both the "upstream" and "downstream" directions, this is in fact an asymmetric partition since
		// it only affects connections *to* the node. n1 itself can connect to the cluster just fine.
		proxy := c.Proxy(1)
		t.L().Printf("letting inbound traffic to first node time out")
		for _, direction := range []string{"upstream", "downstream"} {
			if _, err := proxy.AddToxic("", "timeout", direction, 1, toxiproxy.Attributes{
				"timeout": 0, // forever
			}); err != nil {
				t.Fatal(err)
			}
		}

		t.WorkerStatus("checking goroutines")
		done := time.After(duration)
		var maxSeen int
		for {
			cur := checkGoroutines(ctx)
			if maxSeen < cur {
				t.L().Printf("new goroutine peak: %d", cur)
				maxSeen = cur
			}

			select {
			case <-done:
				t.L().Printf("done checking goroutines, repairing network")
				// Repair the network. Note that the TPCC workload would never
				// finish (despite the duration) without this. In particular,
				// we don't want to m.Wait() before we do this.
				toxics, err := proxy.Toxics()
				if err != nil {
					t.Fatal(err)
				}
				for _, toxic := range toxics {
					if err := proxy.RemoveToxic(toxic.Name); err != nil {
						t.Fatal(err)
					}
				}
				t.L().Printf("network is repaired")

				// Verify that goroutine count doesn't spike.
				for i := 0; i < 20; i++ {
					nowGoroutines := checkGoroutines(ctx)
					t.L().Printf("currently at most %d goroutines per node", nowGoroutines)
					time.Sleep(time.Second)
				}

				return nil
			default:
				time.Sleep(3 * time.Second)
			}
		}
	})

	m.Wait()
}

func registerNetwork(r registry.Registry) {
	const numNodes = 4

	r.Add(registry.TestSpec{
		Name: fmt.Sprintf("network/sanity/nodes=%d", numNodes),
		Owner: registry.OwnerKV,
		Cluster: r.MakeClusterSpec(numNodes),
		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
			runNetworkSanity(ctx, t, c, numNodes)
		},
	})
	r.Add(registry.TestSpec{
		Name: fmt.Sprintf("network/authentication/nodes=%d", numNodes),
		Owner: registry.OwnerServer,
@@ -525,34 +299,4 @@ func registerNetwork(r registry.Registry) {
			runNetworkAuthentication(ctx, t, c)
		},
	})
	r.Add(registry.TestSpec{
		Name: fmt.Sprintf("network/tpcc/nodes=%d", numNodes),
		Owner: registry.OwnerKV,
		Cluster: r.MakeClusterSpec(numNodes),
		Skip: "https://github.com/cockroachdb/cockroach/issues/49901#issuecomment-640666646",
		SkipDetails: `The ordering of steps in the test is:
- install toxiproxy
- start cluster, wait for up-replication
- launch the goroutine that starts the tpcc client command, but do not wait on
  it starting
- immediately, cause a network partition
- only then, the goroutine meant to start the tpcc client goes to fetch the
  pg URLs and start workload, but of course this fails because network
  partition
- tpcc fails to start, so the test tears down before it resolves the network partition
- test tear-down and debug zip fail because the network partition is still active
There are two problems here:
- the tpcc client is not actually started yet when the test sets up the
  network partition. This is a race condition.
- there should be a defer in there to resolve the partition when the test
  aborts prematurely. (And the command to resolve the partition should not be
  sensitive to the test context's Done() channel, because during a tear-down
  that is closed already)
`,
		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
			runNetworkTPCC(ctx, t, c, numNodes)
		},
	})
}
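
The SkipDetails above essentially describe the fix that was never made: the partition should be torn down by a defer so that it gets resolved even when the test aborts before the explicit repair step. Purely as an illustration — nothing below is part of this commit, and the helper name and shape are invented — a defer-friendly variant could look roughly like this, reusing only the toxiproxy client calls that appear in the removed runNetworkTPCC:

package tests

import (
	toxiproxy "github.com/Shopify/toxiproxy/client"
)

// partitionWithCleanup is a hypothetical sketch, not code from this commit: it
// installs the same "timeout" toxics as the removed runNetworkTPCC and returns
// a cleanup closure that removes them again, so a caller can `defer cleanup()`
// right away and the partition is resolved even if the test aborts early.
func partitionWithCleanup(proxy *toxiproxy.Proxy) (cleanup func() error, _ error) {
	for _, direction := range []string{"upstream", "downstream"} {
		if _, err := proxy.AddToxic("", "timeout", direction, 1, toxiproxy.Attributes{
			"timeout": 0, // forever, as in the removed test
		}); err != nil {
			return nil, err
		}
	}
	return func() error {
		// The toxiproxy client speaks plain HTTP and takes no context, so this
		// keeps working during tear-down, when the test context is already done.
		toxics, err := proxy.Toxics()
		if err != nil {
			return err
		}
		for _, toxic := range toxics {
			if err := proxy.RemoveToxic(toxic.Name); err != nil {
				return err
			}
		}
		return nil
	}, nil
}

A caller would then do something like `cleanup, err := partitionWithCleanup(c.Proxy(1))` followed by `defer func() { _ = cleanup() }()` before launching the workload goroutine — the ordering the SkipDetails say was missing.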