roachtest: create "perf" artifacts, write histograms there, collect them

Before the roachtest rewrite in #30977 we used to collect the logs directory from remote hosts unconditionally. We also used to write stats.json histogram files to the logs directory. This combination enabled the collection of histogram files for roachperf. Since the new roachtest only copies logs to the test runner in the case of failure, we have not gotten any perf artifacts since the change. This log policy is sane for logs and other debug artifacts like heap profiles that exist there but has been a bit of a bummer as we've missed out on the dopamine rush due to seeing charts go up and to the right in bespoke D3 glory. The truth is that we weren't all that excited about the previous behavior with regards to perf data either. Sometimes tests would fail and their lousy results would mess with our pretty pictures (https://github.com/cockroachdb/roachperf/issues/37). This PR introduces a new concept, perf artifacts, which live in a different directory, the perfArtifactsDir ("perf") and has the opposite fetching policy from the logs; the "perf" directory is copied from each remove host into the test's artifactsDir only upon successful completion of the test. One downside of this as implemented is that it can spam the test's log (but fortunately not roachtest's logs anymore, <3 @andreimatei). #38890 attempts to address this somewhat. It shouldn't been too big of a problem because it only happens on tests that already passed. This change also necessitates a small change to roachperf to look in this new "perf" directory (and respect the rewrite's test artifact directory structure). Fixes #38871. Release note: None
cockroachdb · Jul 16, 2019 · 3cf2306 · 3cf2306
1 parent 02d52a6
commit 3cf2306
Show file tree

Hide file tree

Showing 11 changed files with 32 additions and 13 deletions.
diff --git a/pkg/cmd/roachtest/indexes.go b/pkg/cmd/roachtest/indexes.go
@@ -56,7 +56,7 @@ func registerNIndexes(r *testRegistry, secondaryIndexes int) {
 				payload := " --payload=256"
 				concurrency := ifLocal("", " --concurrency="+strconv.Itoa(nodes*32))
 				duration := " --duration=" + ifLocal("10s", "30m")
-				runCmd := fmt.Sprintf("./workload run indexes --histograms=logs/stats.json"+
+				runCmd := fmt.Sprintf("./workload run indexes --histograms="+perfArtifactsDir+"/stats.json"+
 					payload+concurrency+duration+" {pgurl%s}", gatewayNodes)
 				c.Run(ctx, loadNode, runCmd)
 				return nil

diff --git a/pkg/cmd/roachtest/interleavedpartitioned.go b/pkg/cmd/roachtest/interleavedpartitioned.go
@@ -67,7 +67,7 @@ func registerInterleaved(r *testRegistry) {
 		c.Run(ctx, cockroachEast.randNode(), cmdInit)
 
 		duration := " --duration " + ifLocal("10s", "10m")
-		histograms := " --histograms logs/stats.json"
+		histograms := " --histograms=" + perfArtifactsDir + "/stats.json"
 
 		createCmd := func(locality string, cockroachNodes nodeListOption) string {
 			return fmt.Sprintf(

diff --git a/pkg/cmd/roachtest/kv.go b/pkg/cmd/roachtest/kv.go
@@ -47,7 +47,7 @@ func registerKV(r *testRegistry) {
 			splits := " --splits=1000"
 			duration := " --duration=" + ifLocal("10s", "10m")
 			readPercent := fmt.Sprintf(" --read-percent=%d", opts.readPercent)
-
+			histograms := "--histograms=" + perfArtifactsDir + "/stats.json"
 			var batchSize string
 			if opts.batchSize > 0 {
 				batchSize = fmt.Sprintf(" --batch=%d", opts.batchSize)
@@ -64,9 +64,8 @@ func registerKV(r *testRegistry) {
 				splits = "" // no splits
 				sequential = " --sequential"
 			}
-
-			cmd := fmt.Sprintf("./workload run kv --init --histograms=logs/stats.json"+
-				concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
+			cmd := fmt.Sprintf("./workload run kv --init"+
+				histograms+concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
 				" {pgurl:1-%d}", nodes)
 			c.Run(ctx, c.Node(nodes+1), cmd)
 			return nil

diff --git a/pkg/cmd/roachtest/ledger.go b/pkg/cmd/roachtest/ledger.go
@@ -36,7 +36,7 @@ func registerLedger(r *testRegistry) {
 				concurrency := ifLocal("", " --concurrency="+fmt.Sprint(nodes*32))
 				duration := " --duration=" + ifLocal("10s", "30m")
 
-				cmd := fmt.Sprintf("./workload run ledger --init --histograms=logs/stats.json"+
+				cmd := fmt.Sprintf("./workload run ledger --init --histograms="+perfArtifactsDir+"/stats.json"+
 					concurrency+duration+" {pgurl%s}", gatewayNodes)
 				c.Run(ctx, loadNode, cmd)
 				return nil

diff --git a/pkg/cmd/roachtest/network.go b/pkg/cmd/roachtest/network.go
@@ -136,7 +136,9 @@ func runNetworkTPCC(ctx context.Context, t *test, origC *cluster, nodes int) {
 		}
 
 		cmd := fmt.Sprintf(
-			"./workload run tpcc --warehouses=%d --wait=false --histograms=logs/stats.json --duration=%s {pgurl:2-%d}",
+			"./workload run tpcc --warehouses=%d --wait=false"+
+				" --histograms="+perfArtifactsDir+"/stats.json"+
+				" --duration=%s {pgurl:2-%d}",
 			warehouses, duration, c.spec.NodeCount-1)
 		return c.RunL(ctx, tpccL, workerNode, cmd)
 	})

diff --git a/pkg/cmd/roachtest/queue.go b/pkg/cmd/roachtest/queue.go
@@ -52,7 +52,7 @@ func runQueue(ctx context.Context, t *test, c *cluster) {
 				init = " --init"
 			}
 			cmd := fmt.Sprintf(
-				"./workload run queue --histograms=logs/stats.json"+
+				"./workload run queue --histograms="+perfArtifactsDir+"/stats.json"+
 					init+
 					concurrency+
 					duration+

diff --git a/pkg/cmd/roachtest/test.go b/pkg/cmd/roachtest/test.go
@@ -70,6 +70,11 @@ type testSpec struct {
 	Run func(ctx context.Context, t *test, c *cluster)
 }
 
+// perfArtifactsDir is the directory on cluster nodes in which perf artifacts
+// reside. Upon success this directory is copied into test artifactsDir from
+// each node in the cluster.
+const perfArtifactsDir = "perf"
+
 // matchOrSkip returns true if the filter matches the test. If the filter does
 // not match the test because the tag filter does not match, the test is
 // matched, but marked as skipped.

diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
@@ -491,6 +491,19 @@ func (r *testRunner) runWorker(
 			if err != nil {
 				return err
 			}
+		} else {
+			// Upon success fetch the perf artifacts from the remote hosts.
+			// If there's an error, oh well, don't do anything rash like fail the test
+			// which already passed.
+			//
+			// TODO(ajwerner): this Get on all nodes has the unfortunate side effect
+			// of logging an error in the test runner log for all of the nodes which
+			// do not have perf artifacts. Ideally we'd have the test tell us which
+			// nodes have the artifacts, or we'd go check explicitly, or we'd find a
+			// way for Get to not complain if a file does not exist.
+			if err := c.Get(ctx, l, perfArtifactsDir, t.artifactsDir+"/"+perfArtifactsDir); err != nil {
+				l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err)
+			}
 		}
 	}
 }

diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go
@@ -167,7 +167,7 @@ func runTPCC(ctx context.Context, t *test, c *cluster, opts tpccOptions) {
 	m.Go(func(ctx context.Context) error {
 		t.WorkerStatus("running tpcc")
 		cmd := fmt.Sprintf(
-			"./workload run tpcc --warehouses=%d --histograms=logs/stats.json "+
+			"./workload run tpcc --warehouses=%d --histograms="+perfArtifactsDir+"/stats.json "+
 				opts.Extra+" --ramp=%s --duration=%s {pgurl:1-%d}",
 			opts.Warehouses, rampDuration, opts.Duration, c.spec.NodeCount-1)
 		c.Run(ctx, workloadNode, cmd)
@@ -781,7 +781,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 					}
 
 					t.Status(fmt.Sprintf("running benchmark, warehouses=%d", warehouses))
-					histogramsPath := fmt.Sprintf("logs/warehouses=%d/stats.json", activeWarehouses)
+					histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", perfArtifactsDir, activeWarehouses)
 					cmd := fmt.Sprintf("./workload run tpcc --warehouses=%d --active-warehouses=%d "+
 						"--tolerate-errors --ramp=%s --duration=%s%s {pgurl%s} "+
 						"--histograms=%s",

diff --git a/pkg/cmd/roachtest/tpchbench.go b/pkg/cmd/roachtest/tpchbench.go
@@ -107,7 +107,7 @@ func runTPCHBench(ctx context.Context, t *test, c *cluster, b tpchBenchSpec) {
 		cmd := fmt.Sprintf(
 			"./workload run querybench --db=tpch --concurrency=1 --query-file=%s "+
 				"--num-runs=%d --max-ops=%d --vectorized=%t {pgurl%s} "+
-				"--histograms=logs/stats.json --histograms-max-latency=%s",
+				"--histograms="+perfArtifactsDir+"/stats.json --histograms-max-latency=%s",
 			filename,
 			b.numRunsPerQuery,
 			maxOps,

diff --git a/pkg/cmd/roachtest/ycsb.go b/pkg/cmd/roachtest/ycsb.go
@@ -30,7 +30,7 @@ func registerYCSB(r *testRegistry) {
 			duration := " --duration=" + ifLocal("10s", "10m")
 			cmd := fmt.Sprintf(
 				"./workload run ycsb --init --record-count=1000000 --splits=100"+
-					" --workload=%s --concurrency=64 --histograms=logs/stats.json"+
+					" --workload=%s --concurrency=64 --histograms="+perfArtifactsDir+"/stats.json"+
 					ramp+duration+" {pgurl:1-%d}",
 				wl, nodes)
 			c.Run(ctx, c.Node(nodes+1), cmd)