From 3cf2306bbf45efd01f3243b74c6e259d5c4080b8 Mon Sep 17 00:00:00 2001
From: Andrew Werner <ajwerner@cockroachlabs.com>
Date: Mon, 15 Jul 2019 19:21:10 -0400
Subject: [PATCH] roachtest: create "perf" artifacts, write histograms there,
 collect them

Before the roachtest rewrite in #30977 we used to collect the logs directory
from remote hosts unconditionally. We also used to write stats.json histogram
files to the logs directory. This combination enabled the collection of
histogram files for roachperf. Since the new roachtest only copies logs
to the test runner in the case of failure, we have not gotten any perf
artifacts since the change.

This log policy is sane for logs and other debug artifacts like heap profiles
that exist there but has been a bit of a bummer as we've missed out on the
dopamine rush due to seeing charts go up and to the right in bespoke D3 glory.
The truth is that we weren't all that excited about the previous behavior with
regards to perf data either. Sometimes tests would fail and their lousy results
would mess with our pretty pictures
(https://github.com/cockroachdb/roachperf/issues/37).

This PR introduces a new concept, perf artifacts, which live in a different
directory, the perfArtifactsDir ("perf") and has the opposite fetching policy
from the logs; the "perf" directory is copied from each remove host into the
test's artifactsDir only upon successful completion of the test.

One downside of this as implemented is that it can spam the test's log (but
fortunately not roachtest's logs anymore, <3 @andreimatei). #38890 attempts to
address this somewhat. It shouldn't been too big of a problem because it only
happens on tests that already passed.

This change also necessitates a small change to roachperf to look in this new
"perf" directory (and respect the rewrite's test artifact directory structure).

Fixes #38871.

Release note: None
---
 pkg/cmd/roachtest/indexes.go                |  2 +-
 pkg/cmd/roachtest/interleavedpartitioned.go |  2 +-
 pkg/cmd/roachtest/kv.go                     |  7 +++----
 pkg/cmd/roachtest/ledger.go                 |  2 +-
 pkg/cmd/roachtest/network.go                |  4 +++-
 pkg/cmd/roachtest/queue.go                  |  2 +-
 pkg/cmd/roachtest/test.go                   |  5 +++++
 pkg/cmd/roachtest/test_runner.go            | 13 +++++++++++++
 pkg/cmd/roachtest/tpcc.go                   |  4 ++--
 pkg/cmd/roachtest/tpchbench.go              |  2 +-
 pkg/cmd/roachtest/ycsb.go                   |  2 +-
 11 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/pkg/cmd/roachtest/indexes.go b/pkg/cmd/roachtest/indexes.go
index 1ff852018027..1f4a2cc9d225 100644
--- a/pkg/cmd/roachtest/indexes.go
+++ b/pkg/cmd/roachtest/indexes.go
@@ -56,7 +56,7 @@ func registerNIndexes(r *testRegistry, secondaryIndexes int) {
 				payload := " --payload=256"
 				concurrency := ifLocal("", " --concurrency="+strconv.Itoa(nodes*32))
 				duration := " --duration=" + ifLocal("10s", "30m")
-				runCmd := fmt.Sprintf("./workload run indexes --histograms=logs/stats.json"+
+				runCmd := fmt.Sprintf("./workload run indexes --histograms="+perfArtifactsDir+"/stats.json"+
 					payload+concurrency+duration+" {pgurl%s}", gatewayNodes)
 				c.Run(ctx, loadNode, runCmd)
 				return nil
diff --git a/pkg/cmd/roachtest/interleavedpartitioned.go b/pkg/cmd/roachtest/interleavedpartitioned.go
index d726ce7a5a58..a6f1324db628 100644
--- a/pkg/cmd/roachtest/interleavedpartitioned.go
+++ b/pkg/cmd/roachtest/interleavedpartitioned.go
@@ -67,7 +67,7 @@ func registerInterleaved(r *testRegistry) {
 		c.Run(ctx, cockroachEast.randNode(), cmdInit)
 
 		duration := " --duration " + ifLocal("10s", "10m")
-		histograms := " --histograms logs/stats.json"
+		histograms := " --histograms=" + perfArtifactsDir + "/stats.json"
 
 		createCmd := func(locality string, cockroachNodes nodeListOption) string {
 			return fmt.Sprintf(
diff --git a/pkg/cmd/roachtest/kv.go b/pkg/cmd/roachtest/kv.go
index d9b85fd10acf..7416f00b5858 100644
--- a/pkg/cmd/roachtest/kv.go
+++ b/pkg/cmd/roachtest/kv.go
@@ -47,7 +47,7 @@ func registerKV(r *testRegistry) {
 			splits := " --splits=1000"
 			duration := " --duration=" + ifLocal("10s", "10m")
 			readPercent := fmt.Sprintf(" --read-percent=%d", opts.readPercent)
-
+			histograms := "--histograms=" + perfArtifactsDir + "/stats.json"
 			var batchSize string
 			if opts.batchSize > 0 {
 				batchSize = fmt.Sprintf(" --batch=%d", opts.batchSize)
@@ -64,9 +64,8 @@ func registerKV(r *testRegistry) {
 				splits = "" // no splits
 				sequential = " --sequential"
 			}
-
-			cmd := fmt.Sprintf("./workload run kv --init --histograms=logs/stats.json"+
-				concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
+			cmd := fmt.Sprintf("./workload run kv --init"+
+				histograms+concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
 				" {pgurl:1-%d}", nodes)
 			c.Run(ctx, c.Node(nodes+1), cmd)
 			return nil
diff --git a/pkg/cmd/roachtest/ledger.go b/pkg/cmd/roachtest/ledger.go
index 2548082300c9..cfa59bec9a29 100644
--- a/pkg/cmd/roachtest/ledger.go
+++ b/pkg/cmd/roachtest/ledger.go
@@ -36,7 +36,7 @@ func registerLedger(r *testRegistry) {
 				concurrency := ifLocal("", " --concurrency="+fmt.Sprint(nodes*32))
 				duration := " --duration=" + ifLocal("10s", "30m")
 
-				cmd := fmt.Sprintf("./workload run ledger --init --histograms=logs/stats.json"+
+				cmd := fmt.Sprintf("./workload run ledger --init --histograms="+perfArtifactsDir+"/stats.json"+
 					concurrency+duration+" {pgurl%s}", gatewayNodes)
 				c.Run(ctx, loadNode, cmd)
 				return nil
diff --git a/pkg/cmd/roachtest/network.go b/pkg/cmd/roachtest/network.go
index db530df121c3..39aa5032e9ed 100644
--- a/pkg/cmd/roachtest/network.go
+++ b/pkg/cmd/roachtest/network.go
@@ -136,7 +136,9 @@ func runNetworkTPCC(ctx context.Context, t *test, origC *cluster, nodes int) {
 		}
 
 		cmd := fmt.Sprintf(
-			"./workload run tpcc --warehouses=%d --wait=false --histograms=logs/stats.json --duration=%s {pgurl:2-%d}",
+			"./workload run tpcc --warehouses=%d --wait=false"+
+				" --histograms="+perfArtifactsDir+"/stats.json"+
+				" --duration=%s {pgurl:2-%d}",
 			warehouses, duration, c.spec.NodeCount-1)
 		return c.RunL(ctx, tpccL, workerNode, cmd)
 	})
diff --git a/pkg/cmd/roachtest/queue.go b/pkg/cmd/roachtest/queue.go
index b8e88f797bb5..f3647a8f6620 100644
--- a/pkg/cmd/roachtest/queue.go
+++ b/pkg/cmd/roachtest/queue.go
@@ -52,7 +52,7 @@ func runQueue(ctx context.Context, t *test, c *cluster) {
 				init = " --init"
 			}
 			cmd := fmt.Sprintf(
-				"./workload run queue --histograms=logs/stats.json"+
+				"./workload run queue --histograms="+perfArtifactsDir+"/stats.json"+
 					init+
 					concurrency+
 					duration+
diff --git a/pkg/cmd/roachtest/test.go b/pkg/cmd/roachtest/test.go
index 01a0b3768815..f7ed3779c81f 100644
--- a/pkg/cmd/roachtest/test.go
+++ b/pkg/cmd/roachtest/test.go
@@ -70,6 +70,11 @@ type testSpec struct {
 	Run func(ctx context.Context, t *test, c *cluster)
 }
 
+// perfArtifactsDir is the directory on cluster nodes in which perf artifacts
+// reside. Upon success this directory is copied into test artifactsDir from
+// each node in the cluster.
+const perfArtifactsDir = "perf"
+
 // matchOrSkip returns true if the filter matches the test. If the filter does
 // not match the test because the tag filter does not match, the test is
 // matched, but marked as skipped.
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 50f47f6af44c..0e4407cc3e09 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -491,6 +491,19 @@ func (r *testRunner) runWorker(
 			if err != nil {
 				return err
 			}
+		} else {
+			// Upon success fetch the perf artifacts from the remote hosts.
+			// If there's an error, oh well, don't do anything rash like fail the test
+			// which already passed.
+			//
+			// TODO(ajwerner): this Get on all nodes has the unfortunate side effect
+			// of logging an error in the test runner log for all of the nodes which
+			// do not have perf artifacts. Ideally we'd have the test tell us which
+			// nodes have the artifacts, or we'd go check explicitly, or we'd find a
+			// way for Get to not complain if a file does not exist.
+			if err := c.Get(ctx, l, perfArtifactsDir, t.artifactsDir+"/"+perfArtifactsDir); err != nil {
+				l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err)
+			}
 		}
 	}
 }
diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go
index 721ed9726842..30eb0ea96078 100644
--- a/pkg/cmd/roachtest/tpcc.go
+++ b/pkg/cmd/roachtest/tpcc.go
@@ -167,7 +167,7 @@ func runTPCC(ctx context.Context, t *test, c *cluster, opts tpccOptions) {
 	m.Go(func(ctx context.Context) error {
 		t.WorkerStatus("running tpcc")
 		cmd := fmt.Sprintf(
-			"./workload run tpcc --warehouses=%d --histograms=logs/stats.json "+
+			"./workload run tpcc --warehouses=%d --histograms="+perfArtifactsDir+"/stats.json "+
 				opts.Extra+" --ramp=%s --duration=%s {pgurl:1-%d}",
 			opts.Warehouses, rampDuration, opts.Duration, c.spec.NodeCount-1)
 		c.Run(ctx, workloadNode, cmd)
@@ -781,7 +781,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 					}
 
 					t.Status(fmt.Sprintf("running benchmark, warehouses=%d", warehouses))
-					histogramsPath := fmt.Sprintf("logs/warehouses=%d/stats.json", activeWarehouses)
+					histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", perfArtifactsDir, activeWarehouses)
 					cmd := fmt.Sprintf("./workload run tpcc --warehouses=%d --active-warehouses=%d "+
 						"--tolerate-errors --ramp=%s --duration=%s%s {pgurl%s} "+
 						"--histograms=%s",
diff --git a/pkg/cmd/roachtest/tpchbench.go b/pkg/cmd/roachtest/tpchbench.go
index aa843505fecc..b4a9d04271ad 100644
--- a/pkg/cmd/roachtest/tpchbench.go
+++ b/pkg/cmd/roachtest/tpchbench.go
@@ -107,7 +107,7 @@ func runTPCHBench(ctx context.Context, t *test, c *cluster, b tpchBenchSpec) {
 		cmd := fmt.Sprintf(
 			"./workload run querybench --db=tpch --concurrency=1 --query-file=%s "+
 				"--num-runs=%d --max-ops=%d --vectorized=%t {pgurl%s} "+
-				"--histograms=logs/stats.json --histograms-max-latency=%s",
+				"--histograms="+perfArtifactsDir+"/stats.json --histograms-max-latency=%s",
 			filename,
 			b.numRunsPerQuery,
 			maxOps,
diff --git a/pkg/cmd/roachtest/ycsb.go b/pkg/cmd/roachtest/ycsb.go
index 1657d02816d4..78c4d4a85b9d 100644
--- a/pkg/cmd/roachtest/ycsb.go
+++ b/pkg/cmd/roachtest/ycsb.go
@@ -30,7 +30,7 @@ func registerYCSB(r *testRegistry) {
 			duration := " --duration=" + ifLocal("10s", "10m")
 			cmd := fmt.Sprintf(
 				"./workload run ycsb --init --record-count=1000000 --splits=100"+
-					" --workload=%s --concurrency=64 --histograms=logs/stats.json"+
+					" --workload=%s --concurrency=64 --histograms="+perfArtifactsDir+"/stats.json"+
 					ramp+duration+" {pgurl:1-%d}",
 				wl, nodes)
 			c.Run(ctx, c.Node(nodes+1), cmd)