From c6badb4c4677fa1947722b246ef1ad658e9ec85f Mon Sep 17 00:00:00 2001
From: Steven Danna
Date: Fri, 28 Oct 2022 11:12:56 +0100
Subject: [PATCH] roachtest: add --debug-always flag

Occasionally it is useful to keep a cluster around even if the workload
happens to complete without error. The --debug-always flag is like
--debug, but it saves the cluster even if the test was successful.

Release note: None
---
 pkg/cmd/roachtest/main.go        | 39 +++++++++++++++++++++---------
 pkg/cmd/roachtest/test_runner.go | 41 +++++++++++++++++++++++++-------
 pkg/cmd/roachtest/test_test.go   | 16 ++++++-------
 3 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go
index 34f6c4590d4a..cac9809cccc9 100644
--- a/pkg/cmd/roachtest/main.go
+++ b/pkg/cmd/roachtest/main.go
@@ -84,7 +84,8 @@ func main() {
 	// ##teamcity[publishArtifacts] in Teamcity mode.
 	var literalArtifacts string
 	var httpPort int
-	var debugEnabled bool
+	var debugOnFailure bool
+	var debugAlways bool
 	var runSkipped bool
 	var skipInit bool
 	var clusterID string
@@ -93,6 +94,16 @@ func main() {
 
 	cobra.EnableCommandSorting = false
 
+	debugModeFromOpts := func() debugMode {
+		if debugAlways {
+			return DebugKeepAlways
+		}
+		if debugOnFailure {
+			return DebugKeepOnFailure
+		}
+		return NoDebug
+	}
+
 	var rootCmd = &cobra.Command{
 		Use:   "roachtest [command] (flags)",
 		Short: "roachtest tool for testing cockroach clusters",
@@ -222,8 +233,8 @@ runner itself.
 				args:         args,
 				count:        count,
 				cpuQuota:     cpuQuota,
-				debugEnabled: debugEnabled,
 				runSkipped:   runSkipped,
+				debugMode:    debugModeFromOpts(),
 				skipInit:     skipInit,
 				httpPort:     httpPort,
 				parallelism:  parallelism,
@@ -263,8 +274,8 @@ runner itself.
 				args:         args,
 				count:        count,
 				cpuQuota:     cpuQuota,
-				debugEnabled: debugEnabled,
 				runSkipped:   runSkipped,
+				debugMode:    debugModeFromOpts(),
 				skipInit:     skipInit,
 				httpPort:     httpPort,
 				parallelism:  parallelism,
@@ -289,7 +300,9 @@ runner itself.
 	cmd.Flags().IntVar(
 		&count, "count", 1, "the number of times to run each test")
 	cmd.Flags().BoolVarP(
-		&debugEnabled, "debug", "d", debugEnabled, "don't wipe and destroy cluster if test fails")
+		&debugOnFailure, "debug", "d", debugOnFailure, "don't wipe and destroy cluster if test fails")
+	cmd.Flags().BoolVar(
+		&debugAlways, "debug-always", debugAlways, "never wipe and destroy the cluster")
 	cmd.Flags().BoolVar(
 		&runSkipped, "run-skipped", runSkipped, "run skipped tests")
 	cmd.Flags().BoolVar(
@@ -360,7 +373,7 @@ type cliCfg struct {
 	args         []string
 	count        int
 	cpuQuota     int
-	debugEnabled bool
+	debugMode    debugMode
 	runSkipped   bool
 	skipInit     bool
 	httpPort     int
@@ -404,12 +417,12 @@ func runTests(register func(registry.Registry), cfg cliCfg) error {
 	}
 	opt := clustersOpt{
-		typ:                       clusterType,
-		clusterName:               clusterName,
-		user:                      getUser(cfg.user),
-		cpuQuota:                  cfg.cpuQuota,
-		keepClustersOnTestFailure: cfg.debugEnabled,
-		clusterID:                 cfg.clusterID,
+		typ:         clusterType,
+		clusterName: clusterName,
+		user:        getUser(cfg.user),
+		cpuQuota:    cfg.cpuQuota,
+		debugMode:   cfg.debugMode,
+		clusterID:   cfg.clusterID,
 	}
 	if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil {
 		return err
 	}
@@ -424,6 +437,10 @@ func runTests(register func(registry.Registry), cfg cliCfg) error {
 		// stdout/stderr.
 		cfg.parallelism = n * cfg.count
 	}
+	if opt.debugMode == DebugKeepAlways && n > 1 {
+		return errors.Newf("--debug-always is only allowed when running a single test")
+	}
+
 	runnerDir := filepath.Join(cfg.artifactsDir, runnerLogsDir)
 	runnerLogPath := filepath.Join(
 		runnerDir, fmt.Sprintf("test_runner-%d.log", timeutil.Now().Unix()))
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 9f6f370f7faa..a2295db0e48c 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -149,10 +149,28 @@ type clustersOpt struct {
 	// test runner will wait for other tests to finish and their cluster to be
 	// destroyed (or reused). Note that this limit is global, not per zone.
 	cpuQuota int
-	// If set, clusters will not be wiped or destroyed when a test using the
-	// respective cluster fails. These cluster will linger around and they'll
-	// continue counting towards the cpuQuota.
-	keepClustersOnTestFailure bool
+
+	// Controls whether the cluster is cleaned up at the end of the test.
+	debugMode debugMode
+}
+
+type debugMode int
+
+const (
+	// NoDebug does not enable any debug behaviour. Clusters will
+	// be destroyed regardless of the test result.
+	NoDebug debugMode = iota
+	// DebugKeepOnFailure does not wipe or destroy a cluster when
+	// a test using the respective cluster fails. These clusters
+	// will linger around and they'll continue counting towards
+	// the cpuQuota.
+	DebugKeepOnFailure
+	// DebugKeepAlways never wipes or destroys a cluster.
+	DebugKeepAlways
+)
+
+func (p debugMode) IsDebug() bool {
+	return p == DebugKeepOnFailure || p == DebugKeepAlways
 }
 
 func (c clustersOpt) validate() error {
@@ -272,7 +290,7 @@ func (r *testRunner) Run(
 			err := r.runWorker(
 				ctx, fmt.Sprintf("w%d", i) /* name */, r.work, qp,
 				r.stopper.ShouldQuiesce(),
-				clustersOpt.keepClustersOnTestFailure,
+				clustersOpt.debugMode,
 				lopt.artifactsDir, lopt.literalArtifactsDir, lopt.tee, lopt.stdout,
 				clusterAllocator,
 				topt,
@@ -449,7 +467,7 @@ func (r *testRunner) runWorker(
 	work *workPool,
 	qp *quotapool.IntPool,
 	interrupt <-chan struct{},
-	debug bool,
+	debugMode debugMode,
 	artifactsRootDir string,
 	literalArtifactsDir string,
 	teeOpt logger.TeeOptType,
@@ -615,7 +633,7 @@ func (r *testRunner) runWorker(
 			l:                      testL,
 			versionsBinaryOverride: topt.versionsBinaryOverride,
 			skipInit:               topt.skipInit,
-			debug:                  debug,
+			debug:                  debugMode.IsDebug(),
 		}
 		// Now run the test.
 		l.PrintfCtx(ctx, "starting test: %s:%d", testToRun.spec.Name, testToRun.runNum)
@@ -685,13 +703,14 @@ func (r *testRunner) runWorker(
 				failureMsg += t.failureMsg()
 			}
 			if c != nil {
-				if debug {
+				switch debugMode {
+				case DebugKeepAlways, DebugKeepOnFailure:
 					// Save the cluster for future debugging.
 					c.Save(ctx, failureMsg, l)
 
 					// Continue with a fresh cluster.
 					c = nil
-				} else {
+				case NoDebug:
 					// On any test failure or error, we destroy the cluster. We could be
 					// more selective, but this sounds safer.
 					l.PrintfCtx(ctx, "destroying cluster %s because: %s", c, failureMsg)
@@ -706,6 +725,10 @@ func (r *testRunner) runWorker(
 		} else {
 			// Upon success fetch the perf artifacts from the remote hosts.
 			getPerfArtifacts(ctx, l, c, t)
+			if debugMode == DebugKeepAlways {
+				c.Save(ctx, "cluster saved since --debug-always set", l)
+				c = nil
+			}
 		}
 	}
 }
diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go
index 4a38e9033292..70ae292281db 100644
--- a/pkg/cmd/roachtest/test_test.go
+++ b/pkg/cmd/roachtest/test_test.go
@@ -265,10 +265,10 @@ func setupRunnerTest(t *testing.T, r testRegistryImpl, testFilters []string) *ru
 		artifactsDir:  "",
 	}
 	copt := clustersOpt{
-		typ:                       roachprodCluster,
-		user:                      "test_user",
-		cpuQuota:                  1000,
-		keepClustersOnTestFailure: false,
+		typ:       roachprodCluster,
+		user:      "test_user",
+		cpuQuota:  1000,
+		debugMode: NoDebug,
 	}
 	return &runnerTest{
 		stdout: &stdout,
@@ -337,10 +337,10 @@ func TestRunnerTestTimeout(t *testing.T) {
 		artifactsDir:  "",
 	}
 	copt := clustersOpt{
-		typ:                       roachprodCluster,
-		user:                      "test_user",
-		cpuQuota:                  1000,
-		keepClustersOnTestFailure: false,
+		typ:       roachprodCluster,
+		user:      "test_user",
+		cpuQuota:  1000,
+		debugMode: NoDebug,
 	}
 	test := registry.TestSpec{
 		Name: `timeout`,
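
---
Reviewer's postscript (not part of the patch to apply): the standalone Go
sketch below mirrors the tri-state resolution in debugModeFromOpts and
condenses the two save-vs-destroy decision points this patch adds to
runWorker. The package layout, the shouldSaveCluster helper, and the printed
output are illustrative only; the flag registrations are copied from the
patch.

package main

import (
	"fmt"
	"os"

	"github.com/spf13/cobra"
)

type debugMode int

const (
	NoDebug debugMode = iota
	DebugKeepOnFailure
	DebugKeepAlways
)

// shouldSaveCluster condenses the two decision points in runWorker: on
// failure, either debug mode saves the cluster; on success, only
// DebugKeepAlways does.
func shouldSaveCluster(mode debugMode, testFailed bool) bool {
	if testFailed {
		return mode == DebugKeepOnFailure || mode == DebugKeepAlways
	}
	return mode == DebugKeepAlways
}

func main() {
	var debugOnFailure, debugAlways bool

	cmd := &cobra.Command{
		Use: "sketch",
		Run: func(cmd *cobra.Command, args []string) {
			// Mirrors debugModeFromOpts: --debug-always wins when both
			// flags are set.
			mode := NoDebug
			if debugAlways {
				mode = DebugKeepAlways
			} else if debugOnFailure {
				mode = DebugKeepOnFailure
			}
			for _, failed := range []bool{true, false} {
				fmt.Printf("test failed=%v -> save cluster: %v\n",
					failed, shouldSaveCluster(mode, failed))
			}
		},
	}
	cmd.Flags().BoolVarP(&debugOnFailure, "debug", "d", false,
		"don't wipe and destroy cluster if test fails")
	cmd.Flags().BoolVar(&debugAlways, "debug-always", false,
		"never wipe and destroy the cluster")

	if err := cmd.Execute(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

Running it as "go run . --debug-always" reports that the cluster would be
saved for both outcomes, while "--debug" saves it only on failure, matching
the behaviour the commit message describes.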