Skip to content

Commit

Permalink
roachtest: add --debug-always flag
Browse files Browse the repository at this point in the history
Occasionally, it is very useful to keep a cluster around even if the
workload happened to complete without error. The --debug-always is
like --debug but saves the cluster even if the test was successful.

Release note: None
  • Loading branch information
stevendanna committed Nov 14, 2022
1 parent f82dead commit c6badb4
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 28 deletions.
39 changes: 28 additions & 11 deletions pkg/cmd/roachtest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ func main() {
// ##teamcity[publishArtifacts] in Teamcity mode.
var literalArtifacts string
var httpPort int
var debugEnabled bool
var debugOnFailure bool
var debugAlways bool
var runSkipped bool
var skipInit bool
var clusterID string
Expand All @@ -93,6 +94,16 @@ func main() {

cobra.EnableCommandSorting = false

debugModeFromOpts := func() debugMode {
if debugAlways {
return DebugKeepAlways
}
if debugOnFailure {
return DebugKeepOnFailure
}
return NoDebug
}

var rootCmd = &cobra.Command{
Use: "roachtest [command] (flags)",
Short: "roachtest tool for testing cockroach clusters",
Expand Down Expand Up @@ -222,8 +233,8 @@ runner itself.
args: args,
count: count,
cpuQuota: cpuQuota,
debugEnabled: debugEnabled,
runSkipped: runSkipped,
debugMode: debugModeFromOpts(),
skipInit: skipInit,
httpPort: httpPort,
parallelism: parallelism,
Expand Down Expand Up @@ -263,8 +274,8 @@ runner itself.
args: args,
count: count,
cpuQuota: cpuQuota,
debugEnabled: debugEnabled,
runSkipped: runSkipped,
debugMode: debugModeFromOpts(),
skipInit: skipInit,
httpPort: httpPort,
parallelism: parallelism,
Expand All @@ -289,7 +300,9 @@ runner itself.
cmd.Flags().IntVar(
&count, "count", 1, "the number of times to run each test")
cmd.Flags().BoolVarP(
&debugEnabled, "debug", "d", debugEnabled, "don't wipe and destroy cluster if test fails")
&debugOnFailure, "debug", "d", debugOnFailure, "don't wipe and destroy cluster if test fails")
cmd.Flags().BoolVar(
&debugAlways, "debug-always", debugAlways, "never wipe and destroy the cluster")
cmd.Flags().BoolVar(
&runSkipped, "run-skipped", runSkipped, "run skipped tests")
cmd.Flags().BoolVar(
Expand Down Expand Up @@ -360,7 +373,7 @@ type cliCfg struct {
args []string
count int
cpuQuota int
debugEnabled bool
debugMode debugMode
runSkipped bool
skipInit bool
httpPort int
Expand Down Expand Up @@ -404,12 +417,12 @@ func runTests(register func(registry.Registry), cfg cliCfg) error {
}

opt := clustersOpt{
typ: clusterType,
clusterName: clusterName,
user: getUser(cfg.user),
cpuQuota: cfg.cpuQuota,
keepClustersOnTestFailure: cfg.debugEnabled,
clusterID: cfg.clusterID,
typ: clusterType,
clusterName: clusterName,
user: getUser(cfg.user),
cpuQuota: cfg.cpuQuota,
debugMode: cfg.debugMode,
clusterID: cfg.clusterID,
}
if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil {
return err
Expand All @@ -424,6 +437,10 @@ func runTests(register func(registry.Registry), cfg cliCfg) error {
// stdout/stderr.
cfg.parallelism = n * cfg.count
}
if opt.debugMode == DebugKeepAlways && n > 1 {
return errors.Newf("--debug-always is only allowed when running a single test")
}

runnerDir := filepath.Join(cfg.artifactsDir, runnerLogsDir)
runnerLogPath := filepath.Join(
runnerDir, fmt.Sprintf("test_runner-%d.log", timeutil.Now().Unix()))
Expand Down
41 changes: 32 additions & 9 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,28 @@ type clustersOpt struct {
// test runner will wait for other tests to finish and their cluster to be
// destroyed (or reused). Note that this limit is global, not per zone.
cpuQuota int
// If set, clusters will not be wiped or destroyed when a test using the
// respective cluster fails. These cluster will linger around and they'll
// continue counting towards the cpuQuota.
keepClustersOnTestFailure bool

// Controls whether the cluster is cleaned up at the end of the test.
debugMode debugMode
}

type debugMode int

const (
// NoDebug does not enable any debug behaviour. Clusters will
// be destroyed regardless of the test result.
NoDebug debugMode = iota
// DebugKeepOnFailure does not wipe or destroy a cluster when
// a test using the respective cluster fails. These clusters
// will linger around and they'll continue counting towards
// the cpuQuota.
DebugKeepOnFailure
// DebugKeepAlways never wipes or destroys a cluster.
DebugKeepAlways
)

func (p debugMode) IsDebug() bool {
return p == DebugKeepOnFailure || p == DebugKeepAlways
}

func (c clustersOpt) validate() error {
Expand Down Expand Up @@ -272,7 +290,7 @@ func (r *testRunner) Run(
err := r.runWorker(
ctx, fmt.Sprintf("w%d", i) /* name */, r.work, qp,
r.stopper.ShouldQuiesce(),
clustersOpt.keepClustersOnTestFailure,
clustersOpt.debugMode,
lopt.artifactsDir, lopt.literalArtifactsDir, lopt.tee, lopt.stdout,
clusterAllocator,
topt,
Expand Down Expand Up @@ -449,7 +467,7 @@ func (r *testRunner) runWorker(
work *workPool,
qp *quotapool.IntPool,
interrupt <-chan struct{},
debug bool,
debugMode debugMode,
artifactsRootDir string,
literalArtifactsDir string,
teeOpt logger.TeeOptType,
Expand Down Expand Up @@ -615,7 +633,7 @@ func (r *testRunner) runWorker(
l: testL,
versionsBinaryOverride: topt.versionsBinaryOverride,
skipInit: topt.skipInit,
debug: debug,
debug: debugMode.IsDebug(),
}
// Now run the test.
l.PrintfCtx(ctx, "starting test: %s:%d", testToRun.spec.Name, testToRun.runNum)
Expand Down Expand Up @@ -685,13 +703,14 @@ func (r *testRunner) runWorker(
failureMsg += t.failureMsg()
}
if c != nil {
if debug {
switch debugMode {
case DebugKeepAlways, DebugKeepOnFailure:
// Save the cluster for future debugging.
c.Save(ctx, failureMsg, l)

// Continue with a fresh cluster.
c = nil
} else {
case NoDebug:
// On any test failure or error, we destroy the cluster. We could be
// more selective, but this sounds safer.
l.PrintfCtx(ctx, "destroying cluster %s because: %s", c, failureMsg)
Expand All @@ -706,6 +725,10 @@ func (r *testRunner) runWorker(
} else {
// Upon success fetch the perf artifacts from the remote hosts.
getPerfArtifacts(ctx, l, c, t)
if debugMode == DebugKeepAlways {
c.Save(ctx, "cluster saved since --debug-always set", l)
c = nil
}
}
}
}
Expand Down
16 changes: 8 additions & 8 deletions pkg/cmd/roachtest/test_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,10 @@ func setupRunnerTest(t *testing.T, r testRegistryImpl, testFilters []string) *ru
artifactsDir: "",
}
copt := clustersOpt{
typ: roachprodCluster,
user: "test_user",
cpuQuota: 1000,
keepClustersOnTestFailure: false,
typ: roachprodCluster,
user: "test_user",
cpuQuota: 1000,
debugMode: NoDebug,
}
return &runnerTest{
stdout: &stdout,
Expand Down Expand Up @@ -337,10 +337,10 @@ func TestRunnerTestTimeout(t *testing.T) {
artifactsDir: "",
}
copt := clustersOpt{
typ: roachprodCluster,
user: "test_user",
cpuQuota: 1000,
keepClustersOnTestFailure: false,
typ: roachprodCluster,
user: "test_user",
cpuQuota: 1000,
debugMode: NoDebug,
}
test := registry.TestSpec{
Name: `timeout`,
Expand Down

0 comments on commit c6badb4

Please sign in to comment.