80230: roachtest: remove network/* family of toxiproxy tests r=srosenberg a=tbg

These were originally introduced in cockroachdb#33282 in response to a resource leak
(goroutine buildup) under prolonged network partitions. However, this type of
testing never caught on, and the meaningful part of these tests has been skipped
for years at this point, with little hope for a comeback.

It's possible that we'll consider toxiproxy again in the future, and I'm not
against that in general. If and when we do, the git history will still contain
bits and pieces that we can salvage.

Release note: None


Co-authored-by: Tobias Grieger <[email protected]>
craig[bot] and tbg committed Apr 25, 2022
2 parents 8fe42c1 + 4136114 commit a1669f1
Showing 5 changed files with 1 addition and 539 deletions.
1 change: 0 additions & 1 deletion go.mod
@@ -16,7 +16,6 @@ require (
	github.com/MichaelTJones/walk v0.0.0-20161122175330-4748e29d5718
	github.com/PuerkitoBio/goquery v1.5.1
	github.com/Shopify/sarama v1.29.0
	github.com/Shopify/toxiproxy v2.1.4+incompatible
	github.com/VividCortex/ewma v1.1.1
	github.com/abourget/teamcity v0.0.0-00010101000000-000000000000
	github.com/alessio/shellescape v1.4.1
2 changes: 0 additions & 2 deletions pkg/cmd/roachtest/tests/BUILD.bazel
@@ -119,7 +119,6 @@ go_library(
"synctest.go",
"sysbench.go",
"tlp.go",
"toxiproxy.go",
"tpc_utils.go",
"tpcc.go",
"tpcdsvec.go",
@@ -213,7 +212,6 @@ go_library(
"@com_github_prometheus_client_golang//api/prometheus/v1:prometheus",
"@com_github_prometheus_common//model",
"@com_github_shopify_sarama//:sarama",
"@com_github_shopify_toxiproxy//client",
"@com_github_stretchr_testify//require",
"@org_golang_google_protobuf//proto",
"@org_golang_x_exp//rand",
256 changes: 0 additions & 256 deletions pkg/cmd/roachtest/tests/network.go
@@ -11,103 +11,25 @@
package tests

import (
"bytes"
"context"
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"sync/atomic"
"time"

toxiproxy "github.com/Shopify/toxiproxy/client"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
_ "github.com/lib/pq" // register postgres driver
"github.com/stretchr/testify/require"
)

// runNetworkSanity is just a sanity check to make sure we're setting up toxiproxy
// correctly. It injects latency between the nodes and verifies that we're not
// seeing the latency on the client connection running `SELECT 1` on each node.
func runNetworkSanity(ctx context.Context, t test.Test, origC cluster.Cluster, nodes int) {
	origC.Put(ctx, t.Cockroach(), "./cockroach", origC.All())
	c, err := Toxify(ctx, t, origC, origC.All())
	if err != nil {
		t.Fatal(err)
	}

	c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.All())

	db := c.Conn(ctx, t.L(), 1) // unaffected by toxiproxy
	defer db.Close()
	err = WaitFor3XReplication(ctx, t, db)
	require.NoError(t, err)

	// NB: we're generous with latency in this test because we're checking that
	// the upstream connections aren't affected by latency below, but the fixed
	// cost of starting the binary and processing the query is already close to
	// 100ms.
	const latency = 300 * time.Millisecond
	for i := 1; i <= nodes; i++ {
		// NB: note that these latencies only apply to connections *to* the node
		// on which the toxic is active. That is, if n1 has a (down or upstream)
		// latency toxic of 100ms, then none of its outbound connections are
		// affected but any connections made to it by other nodes will.
		// In particular, it's difficult to simulate intricate network partitions
		// as there's no way to activate toxics only for certain peers.
		proxy := c.Proxy(i)
		if _, err := proxy.AddToxic("", "latency", "downstream", 1, toxiproxy.Attributes{
			"latency": latency / (2 * time.Millisecond), // ms
		}); err != nil {
			t.Fatal(err)
		}
		if _, err := proxy.AddToxic("", "latency", "upstream", 1, toxiproxy.Attributes{
			"latency": latency / (2 * time.Millisecond), // ms
		}); err != nil {
			t.Fatal(err)
		}
	}

	m := c.Cluster.NewMonitor(ctx, c.All())
	m.Go(func(ctx context.Context) error {
		c.Measure(ctx, 1, `SET CLUSTER SETTING trace.debug.enable = true`)
		c.Measure(ctx, 1, "CREATE DATABASE test")
		c.Measure(ctx, 1, `CREATE TABLE test.commit (a INT, b INT, v INT, PRIMARY KEY (a, b))`)

		for i := 0; i < 10; i++ {
			duration := c.Measure(ctx, 1, fmt.Sprintf(
				"BEGIN; INSERT INTO test.commit VALUES (2, %[1]d), (1, %[1]d), (3, %[1]d); COMMIT",
				i,
			))
			t.L().Printf("%s\n", duration)
		}

		c.Measure(ctx, 1, `
set tracing=on;
insert into test.commit values(3,1000), (1,1000), (2,1000);
select age, message from [ show trace for session ];
`)

		for i := 1; i <= origC.Spec().NodeCount; i++ {
			if dur := c.Measure(ctx, i, `SELECT 1`); dur > latency {
				t.Fatalf("node %d unexpectedly affected by latency: select 1 took %.2fs", i, dur.Seconds())
			}
		}

		return nil
	})

	m.Wait()
}

// runNetworkAuthentication creates a network black hole to the leaseholder
// of system.users, and then validates that the time required to create
// new connections to the cluster afterwards remains under a reasonable limit.
@@ -367,156 +289,8 @@ sudo iptables-save
	m.Wait()
}

func runNetworkTPCC(ctx context.Context, t test.Test, origC cluster.Cluster, nodes int) {
	n := origC.Spec().NodeCount
	serverNodes, workerNode := origC.Range(1, n-1), origC.Node(n)
	origC.Put(ctx, t.Cockroach(), "./cockroach", origC.All())
	origC.Put(ctx, t.DeprecatedWorkload(), "./workload", origC.All())

	c, err := Toxify(ctx, t, origC, serverNodes)
	if err != nil {
		t.Fatal(err)
	}

	const warehouses = 1
	c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), serverNodes)
	c.Run(ctx, c.Node(1), tpccImportCmd(warehouses))

	db := c.Conn(ctx, t.L(), 1)
	defer db.Close()
	err = WaitFor3XReplication(ctx, t, db)
	require.NoError(t, err)

	duration := time.Hour
	if c.IsLocal() {
		// NB: this is really just testing the test with this duration, it won't
		// be able to detect slow goroutine leaks.
		duration = 5 * time.Minute
	}

	// Run TPCC, but don't give it the first node (or it basically won't do anything).
	m := c.NewMonitor(ctx, serverNodes)

	m.Go(func(ctx context.Context) error {
		t.WorkerStatus("running tpcc")

		cmd := fmt.Sprintf(
			"./workload run tpcc --warehouses=%d --wait=false"+
				" --histograms="+t.PerfArtifactsDir()+"/stats.json"+
				" --duration=%s {pgurl:2-%d}",
			warehouses, duration, c.Spec().NodeCount-1)
		return c.RunE(ctx, workerNode, cmd)
	})

	checkGoroutines := func(ctx context.Context) int {
		// NB: at the time of writing, the goroutine count would quickly
		// stabilize near 230 when the network is partitioned, and around 270
		// when it isn't. Experimentally a past "slow" goroutine leak leaked ~3
		// goroutines every minute (though it would likely be more with the tpcc
		// workload above), which over the duration of an hour would easily push
		// us over the threshold.
		const thresh = 350

		uiAddrs, err := c.ExternalAdminUIAddr(ctx, t.L(), serverNodes)
		if err != nil {
			t.Fatal(err)
		}
		var maxSeen int
		// The goroutine dump may take a while to generate, maybe more
		// than the 3 second timeout of the default http client.
		httpClient := httputil.NewClientWithTimeout(15 * time.Second)
		for _, addr := range uiAddrs {
			url := "http://" + addr + "/debug/pprof/goroutine?debug=2"
			resp, err := httpClient.Get(ctx, url)
			if err != nil {
				t.Fatal(err)
			}
			content, err := ioutil.ReadAll(resp.Body)
			resp.Body.Close()
			if err != nil {
				t.Fatal(err)
			}
			numGoroutines := bytes.Count(content, []byte("goroutine "))
			if numGoroutines >= thresh {
				t.Fatalf("%s shows %d goroutines (expected <%d)", url, numGoroutines, thresh)
			}
			if maxSeen < numGoroutines {
				maxSeen = numGoroutines
			}
		}
		return maxSeen
	}

	m.Go(func(ctx context.Context) error {
		time.Sleep(10 * time.Second) // give tpcc a head start
		// Give n1 a network partition from the remainder of the cluster. Note that even though it affects
		// both the "upstream" and "downstream" directions, this is in fact an asymmetric partition since
		// it only affects connections *to* the node. n1 itself can connect to the cluster just fine.
		proxy := c.Proxy(1)
		t.L().Printf("letting inbound traffic to first node time out")
		for _, direction := range []string{"upstream", "downstream"} {
			if _, err := proxy.AddToxic("", "timeout", direction, 1, toxiproxy.Attributes{
				"timeout": 0, // forever
			}); err != nil {
				t.Fatal(err)
			}
		}

		t.WorkerStatus("checking goroutines")
		done := time.After(duration)
		var maxSeen int
		for {
			cur := checkGoroutines(ctx)
			if maxSeen < cur {
				t.L().Printf("new goroutine peak: %d", cur)
				maxSeen = cur
			}

			select {
			case <-done:
				t.L().Printf("done checking goroutines, repairing network")
				// Repair the network. Note that the TPCC workload would never
				// finish (despite the duration) without this. In particular,
				// we don't want to m.Wait() before we do this.
				toxics, err := proxy.Toxics()
				if err != nil {
					t.Fatal(err)
				}
				for _, toxic := range toxics {
					if err := proxy.RemoveToxic(toxic.Name); err != nil {
						t.Fatal(err)
					}
				}
				t.L().Printf("network is repaired")

				// Verify that goroutine count doesn't spike.
				for i := 0; i < 20; i++ {
					nowGoroutines := checkGoroutines(ctx)
					t.L().Printf("currently at most %d goroutines per node", nowGoroutines)
					time.Sleep(time.Second)
				}

				return nil
			default:
				time.Sleep(3 * time.Second)
			}
		}
	})

	m.Wait()
}

func registerNetwork(r registry.Registry) {
	const numNodes = 4

	r.Add(registry.TestSpec{
		Name: fmt.Sprintf("network/sanity/nodes=%d", numNodes),
		Owner: registry.OwnerKV,
		Cluster: r.MakeClusterSpec(numNodes),
		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
			runNetworkSanity(ctx, t, c, numNodes)
		},
	})
	r.Add(registry.TestSpec{
		Name: fmt.Sprintf("network/authentication/nodes=%d", numNodes),
		Owner: registry.OwnerServer,
@@ -525,34 +299,4 @@ func registerNetwork(r registry.Registry) {
			runNetworkAuthentication(ctx, t, c)
		},
	})
	r.Add(registry.TestSpec{
		Name: fmt.Sprintf("network/tpcc/nodes=%d", numNodes),
		Owner: registry.OwnerKV,
		Cluster: r.MakeClusterSpec(numNodes),
		Skip: "https://github.com/cockroachdb/cockroach/issues/49901#issuecomment-640666646",
		SkipDetails: `The ordering of steps in the test is:
- install toxiproxy
- start cluster, wait for up-replication
- launch the goroutine that starts the tpcc client command, but do not wait on
  it starting
- immediately, cause a network partition
- only then, the goroutine meant to start the tpcc client goes to fetch the
  pg URLs and start workload, but of course this fails because network
  partition
- tpcc fails to start, so the test tears down before it resolves the network partition
- test tear-down and debug zip fail because the network partition is still active
There are two problems here:
- the tpcc client is not actually started yet when the test sets up the
  network partition. This is a race condition.
- there should be a defer in there to resolve the partition when the test
  aborts prematurely. (And the command to resolve the partition should not be
  sensitive to the test context's Done() channel, because during a tear-down
  that is closed already)
`,
		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
			runNetworkTPCC(ctx, t, c, numNodes)
		},
	})
}
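
The SkipDetails above essentially describe the fix that was never made: the partition should be torn down by a defer so that it gets resolved even when the test aborts before the explicit repair step. Purely as an illustration — nothing below is part of this commit, and the helper name and shape are invented — a defer-friendly variant could look roughly like this, reusing only the toxiproxy client calls that appear in the removed runNetworkTPCC:

package tests

import (
	toxiproxy "github.com/Shopify/toxiproxy/client"
)

// partitionWithCleanup is a hypothetical sketch, not code from this commit: it
// installs the same "timeout" toxics as the removed runNetworkTPCC and returns
// a cleanup closure that removes them again, so a caller can `defer cleanup()`
// right away and the partition is resolved even if the test aborts early.
func partitionWithCleanup(proxy *toxiproxy.Proxy) (cleanup func() error, _ error) {
	for _, direction := range []string{"upstream", "downstream"} {
		if _, err := proxy.AddToxic("", "timeout", direction, 1, toxiproxy.Attributes{
			"timeout": 0, // forever, as in the removed test
		}); err != nil {
			return nil, err
		}
	}
	return func() error {
		// The toxiproxy client speaks plain HTTP and takes no context, so this
		// keeps working during tear-down, when the test context is already done.
		toxics, err := proxy.Toxics()
		if err != nil {
			return err
		}
		for _, toxic := range toxics {
			if err := proxy.RemoveToxic(toxic.Name); err != nil {
				return err
			}
		}
		return nil
	}, nil
}

A caller would then do something like `cleanup, err := partitionWithCleanup(c.Proxy(1))` followed by `defer func() { _ = cleanup() }()` before launching the workload goroutine — the ordering the SkipDetails say was missing.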