From c6badb4c4677fa1947722b246ef1ad658e9ec85f Mon Sep 17 00:00:00 2001
From: Steven Danna
Date: Fri, 28 Oct 2022 11:12:56 +0100
Subject: [PATCH] roachtest: add --debug-always flag

Occasionally it is useful to keep a cluster around even if the workload
happens to complete without error. The --debug-always flag is like
--debug, but it saves the cluster even if the test was successful.

Release note: None
---
 pkg/cmd/roachtest/main.go        | 39 +++++++++++++++++++++---------
 pkg/cmd/roachtest/test_runner.go | 41 +++++++++++++++++++++++++-------
 pkg/cmd/roachtest/test_test.go   | 16 ++++++-------
 3 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go
index 34f6c4590d4a..cac9809cccc9 100644
--- a/pkg/cmd/roachtest/main.go
+++ b/pkg/cmd/roachtest/main.go
@@ -84,7 +84,8 @@ func main() {
 	// ##teamcity[publishArtifacts] in Teamcity mode.
 	var literalArtifacts string
 	var httpPort int
-	var debugEnabled bool
+	var debugOnFailure bool
+	var debugAlways bool
 	var runSkipped bool
 	var skipInit bool
 	var clusterID string
@@ -93,6 +94,16 @@ func main() {
 
 	cobra.EnableCommandSorting = false
 
+	debugModeFromOpts := func() debugMode {
+		if debugAlways {
+			return DebugKeepAlways
+		}
+		if debugOnFailure {
+			return DebugKeepOnFailure
+		}
+		return NoDebug
+	}
+
 	var rootCmd = &cobra.Command{
 		Use:   "roachtest [command] (flags)",
 		Short: "roachtest tool for testing cockroach clusters",
@@ -222,8 +233,8 @@ runner itself.
 				args:         args,
 				count:        count,
 				cpuQuota:     cpuQuota,
-				debugEnabled: debugEnabled,
 				runSkipped:   runSkipped,
+				debugMode:    debugModeFromOpts(),
 				skipInit:     skipInit,
 				httpPort:     httpPort,
 				parallelism:  parallelism,
@@ -263,8 +274,8 @@ runner itself.
 				args:         args,
 				count:        count,
 				cpuQuota:     cpuQuota,
-				debugEnabled: debugEnabled,
 				runSkipped:   runSkipped,
+				debugMode:    debugModeFromOpts(),
 				skipInit:     skipInit,
 				httpPort:     httpPort,
 				parallelism:  parallelism,
@@ -289,7 +300,9 @@ runner itself.
 	cmd.Flags().IntVar(
 		&count, "count", 1, "the number of times to run each test")
 	cmd.Flags().BoolVarP(
-		&debugEnabled, "debug", "d", debugEnabled, "don't wipe and destroy cluster if test fails")
+		&debugOnFailure, "debug", "d", debugOnFailure, "don't wipe and destroy cluster if test fails")
+	cmd.Flags().BoolVar(
+		&debugAlways, "debug-always", debugAlways, "never wipe and destroy the cluster")
 	cmd.Flags().BoolVar(
 		&runSkipped, "run-skipped", runSkipped, "run skipped tests")
 	cmd.Flags().BoolVar(
@@ -360,7 +373,7 @@ type cliCfg struct {
 	args         []string
 	count        int
 	cpuQuota     int
-	debugEnabled bool
+	debugMode    debugMode
 	runSkipped   bool
 	skipInit     bool
 	httpPort     int
@@ -404,12 +417,12 @@ func runTests(register func(registry.Registry), cfg cliCfg) error {
 	}
 	opt := clustersOpt{
-		typ:                       clusterType,
-		clusterName:               clusterName,
-		user:                      getUser(cfg.user),
-		cpuQuota:                  cfg.cpuQuota,
-		keepClustersOnTestFailure: cfg.debugEnabled,
-		clusterID:                 cfg.clusterID,
+		typ:         clusterType,
+		clusterName: clusterName,
+		user:        getUser(cfg.user),
+		cpuQuota:    cfg.cpuQuota,
+		debugMode:   cfg.debugMode,
+		clusterID:   cfg.clusterID,
 	}
 	if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil {
 		return err
 	}
@@ -424,6 +437,10 @@ func runTests(register func(registry.Registry), cfg cliCfg) error {
 		// stdout/stderr.
 		cfg.parallelism = n * cfg.count
 	}
+	if opt.debugMode == DebugKeepAlways && n > 1 {
+		return errors.Newf("--debug-always is only allowed when running a single test")
+	}
+
 	runnerDir := filepath.Join(cfg.artifactsDir, runnerLogsDir)
 	runnerLogPath := filepath.Join(
 		runnerDir, fmt.Sprintf("test_runner-%d.log", timeutil.Now().Unix()))
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 9f6f370f7faa..a2295db0e48c 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -149,10 +149,28 @@ type clustersOpt struct {
 	// test runner will wait for other tests to finish and their cluster to be
 	// destroyed (or reused). Note that this limit is global, not per zone.
 	cpuQuota int
-	// If set, clusters will not be wiped or destroyed when a test using the
-	// respective cluster fails. These cluster will linger around and they'll
-	// continue counting towards the cpuQuota.
-	keepClustersOnTestFailure bool
+
+	// Controls whether the cluster is cleaned up at the end of the test.
+	debugMode debugMode
+}
+
+type debugMode int
+
+const (
+	// NoDebug does not enable any debug behaviour. Clusters will
+	// be destroyed regardless of the test result.
+	NoDebug debugMode = iota
+	// DebugKeepOnFailure does not wipe or destroy a cluster when
+	// a test using the respective cluster fails. These clusters
+	// will linger around and they'll continue counting towards
+	// the cpuQuota.
+	DebugKeepOnFailure
+	// DebugKeepAlways never wipes or destroys a cluster.
+	DebugKeepAlways
+)
+
+func (p debugMode) IsDebug() bool {
+	return p == DebugKeepOnFailure || p == DebugKeepAlways
 }
 
 func (c clustersOpt) validate() error {
@@ -272,7 +290,7 @@ func (r *testRunner) Run(
 			err := r.runWorker(
 				ctx, fmt.Sprintf("w%d", i) /* name */, r.work, qp,
 				r.stopper.ShouldQuiesce(),
-				clustersOpt.keepClustersOnTestFailure,
+				clustersOpt.debugMode,
 				lopt.artifactsDir, lopt.literalArtifactsDir, lopt.tee, lopt.stdout,
 				clusterAllocator,
 				topt,
@@ -449,7 +467,7 @@ func (r *testRunner) runWorker(
 	work *workPool,
 	qp *quotapool.IntPool,
 	interrupt <-chan struct{},
-	debug bool,
+	debugMode debugMode,
 	artifactsRootDir string,
 	literalArtifactsDir string,
 	teeOpt logger.TeeOptType,
@@ -615,7 +633,7 @@ func (r *testRunner) runWorker(
 			l:                      testL,
 			versionsBinaryOverride: topt.versionsBinaryOverride,
 			skipInit:               topt.skipInit,
-			debug:                  debug,
+			debug:                  debugMode.IsDebug(),
 		}
 		// Now run the test.
 		l.PrintfCtx(ctx, "starting test: %s:%d", testToRun.spec.Name, testToRun.runNum)
@@ -685,13 +703,14 @@ func (r *testRunner) runWorker(
 				failureMsg += t.failureMsg()
 			}
 			if c != nil {
-				if debug {
+				switch debugMode {
+				case DebugKeepAlways, DebugKeepOnFailure:
 					// Save the cluster for future debugging.
 					c.Save(ctx, failureMsg, l)
 
 					// Continue with a fresh cluster.
 					c = nil
-				} else {
+				case NoDebug:
 					// On any test failure or error, we destroy the cluster. We could be
 					// more selective, but this sounds safer.
 					l.PrintfCtx(ctx, "destroying cluster %s because: %s", c, failureMsg)
@@ -706,6 +725,10 @@ func (r *testRunner) runWorker(
 		} else {
 			// Upon success fetch the perf artifacts from the remote hosts.
 			getPerfArtifacts(ctx, l, c, t)
+			if debugMode == DebugKeepAlways {
+				c.Save(ctx, "cluster saved since --debug-always set", l)
+				c = nil
+			}
 		}
 	}
 }
diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go
index 4a38e9033292..70ae292281db 100644
--- a/pkg/cmd/roachtest/test_test.go
+++ b/pkg/cmd/roachtest/test_test.go
@@ -265,10 +265,10 @@ func setupRunnerTest(t *testing.T, r testRegistryImpl, testFilters []string) *ru
 		artifactsDir:  "",
 	}
 	copt := clustersOpt{
-		typ:                       roachprodCluster,
-		user:                      "test_user",
-		cpuQuota:                  1000,
-		keepClustersOnTestFailure: false,
+		typ:       roachprodCluster,
+		user:      "test_user",
+		cpuQuota:  1000,
+		debugMode: NoDebug,
 	}
 	return &runnerTest{
 		stdout: &stdout,
@@ -337,10 +337,10 @@ func TestRunnerTestTimeout(t *testing.T) {
 		artifactsDir:  "",
 	}
 	copt := clustersOpt{
-		typ:                       roachprodCluster,
-		user:                      "test_user",
-		cpuQuota:                  1000,
-		keepClustersOnTestFailure: false,
+		typ:       roachprodCluster,
+		user:      "test_user",
+		cpuQuota:  1000,
+		debugMode: NoDebug,
 	}
 	test := registry.TestSpec{
 		Name: `timeout`,
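
---
Reviewer's postscript (not part of the patch to apply): the standalone Go
sketch below mirrors the tri-state resolution in debugModeFromOpts and
condenses the two save-vs-destroy decision points this patch adds to
runWorker. The package layout, the shouldSaveCluster helper, and the printed
output are illustrative only; the flag registrations are copied from the
patch.

package main

import (
	"fmt"
	"os"

	"github.com/spf13/cobra"
)

type debugMode int

const (
	NoDebug debugMode = iota
	DebugKeepOnFailure
	DebugKeepAlways
)

// shouldSaveCluster condenses the two decision points in runWorker: on
// failure, either debug mode saves the cluster; on success, only
// DebugKeepAlways does.
func shouldSaveCluster(mode debugMode, testFailed bool) bool {
	if testFailed {
		return mode == DebugKeepOnFailure || mode == DebugKeepAlways
	}
	return mode == DebugKeepAlways
}

func main() {
	var debugOnFailure, debugAlways bool

	cmd := &cobra.Command{
		Use: "sketch",
		Run: func(cmd *cobra.Command, args []string) {
			// Mirrors debugModeFromOpts: --debug-always wins when both
			// flags are set.
			mode := NoDebug
			if debugAlways {
				mode = DebugKeepAlways
			} else if debugOnFailure {
				mode = DebugKeepOnFailure
			}
			for _, failed := range []bool{true, false} {
				fmt.Printf("test failed=%v -> save cluster: %v\n",
					failed, shouldSaveCluster(mode, failed))
			}
		},
	}
	cmd.Flags().BoolVarP(&debugOnFailure, "debug", "d", false,
		"don't wipe and destroy cluster if test fails")
	cmd.Flags().BoolVar(&debugAlways, "debug-always", false,
		"never wipe and destroy the cluster")

	if err := cmd.Execute(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

Running it as "go run . --debug-always" reports that the cluster would be
saved for both outcomes, while "--debug" saves it only on failure, matching
the behaviour the commit message describes.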