From 0492bb9d59412fc7003c500d3fa90af3f9828cd6 Mon Sep 17 00:00:00 2001
From: Andrew Werner <ajwerner@cockroachlabs.com>
Date: Mon, 15 Jul 2019 19:21:10 -0400
Subject: [PATCH] roachtest: create perf artifacts concept and collect them

Before the roachtest rewrite in #30977 we used to collect the logs directory
from remote hosts unconditionally. We also used to write stats.json histogram
files to the logs directory. This combination enabled the collection of
histogram files for roachperf. Since the new roachtest does not unconditionally
copy logs to the test runner, we have not gotten any perf artifacts since the
change.

This PR introduces a new concept, perf artifacts, which live in a different
directory, the perfArtifactsDir. If a testSpec indicates that it
HasPerfArtifacts then that directory is copied to the test runner's
artifactsDir for that test from each node in the cluster upon success.

This change also necessitates a small change to roachperf to look in this new
"perf" directory.

Fixes #38871.

Release note: None
---
 pkg/cmd/roachtest/indexes.go                |  5 +++--
 pkg/cmd/roachtest/interleavedpartitioned.go |  7 ++++---
 pkg/cmd/roachtest/kv.go                     | 15 ++++++++-------
 pkg/cmd/roachtest/ledger.go                 |  7 ++++---
 pkg/cmd/roachtest/network.go                |  9 ++++++---
 pkg/cmd/roachtest/queue.go                  |  9 +++++----
 pkg/cmd/roachtest/test.go                   | 13 +++++++++++++
 pkg/cmd/roachtest/test_runner.go            | 13 +++++++++++++
 pkg/cmd/roachtest/tpcc.go                   | 20 +++++++++++---------
 pkg/cmd/roachtest/tpchbench.go              |  9 +++++----
 pkg/cmd/roachtest/ycsb.go                   |  7 ++++---
 11 files changed, 76 insertions(+), 38 deletions(-)

diff --git a/pkg/cmd/roachtest/indexes.go b/pkg/cmd/roachtest/indexes.go
index 1ff852018027..75422f253120 100644
--- a/pkg/cmd/roachtest/indexes.go
+++ b/pkg/cmd/roachtest/indexes.go
@@ -25,7 +25,8 @@ func registerNIndexes(r *testRegistry, secondaryIndexes int) {
 		Name:    fmt.Sprintf("indexes/%d/nodes=%d/multi-region", secondaryIndexes, nodes),
 		Cluster: makeClusterSpec(nodes+1, cpu(16), geo(), zones(geoZonesStr)),
 		// Uses CONFIGURE ZONE USING ... COPY FROM PARENT syntax.
-		MinVersion: `v19.1.0`,
+		MinVersion:       `v19.1.0`,
+		HasPerfArtifacts: true,
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			firstAZ := geoZones[0]
 			roachNodes := c.Range(1, nodes)
@@ -56,7 +57,7 @@ func registerNIndexes(r *testRegistry, secondaryIndexes int) {
 				payload := " --payload=256"
 				concurrency := ifLocal("", " --concurrency="+strconv.Itoa(nodes*32))
 				duration := " --duration=" + ifLocal("10s", "30m")
-				runCmd := fmt.Sprintf("./workload run indexes --histograms=logs/stats.json"+
+				runCmd := fmt.Sprintf("./workload run indexes --histograms="+perfArtifactsDir+"/stats.json"+
 					payload+concurrency+duration+" {pgurl%s}", gatewayNodes)
 				c.Run(ctx, loadNode, runCmd)
 				return nil
diff --git a/pkg/cmd/roachtest/interleavedpartitioned.go b/pkg/cmd/roachtest/interleavedpartitioned.go
index d726ce7a5a58..9453deaba0e0 100644
--- a/pkg/cmd/roachtest/interleavedpartitioned.go
+++ b/pkg/cmd/roachtest/interleavedpartitioned.go
@@ -67,7 +67,7 @@ func registerInterleaved(r *testRegistry) {
 		c.Run(ctx, cockroachEast.randNode(), cmdInit)
 
 		duration := " --duration " + ifLocal("10s", "10m")
-		histograms := " --histograms logs/stats.json"
+		histograms := " --histograms=" + perfArtifactsDir + "/stats.json"
 
 		createCmd := func(locality string, cockroachNodes nodeListOption) string {
 			return fmt.Sprintf(
@@ -123,8 +123,9 @@ func registerInterleaved(r *testRegistry) {
 	}
 
 	r.Add(testSpec{
-		Name:    "interleavedpartitioned",
-		Cluster: makeClusterSpec(12, geo(), zones("us-west1-b,us-east4-b,us-central1-a")),
+		Name:             "interleavedpartitioned",
+		Cluster:          makeClusterSpec(12, geo(), zones("us-west1-b,us-east4-b,us-central1-a")),
+		HasPerfArtifacts: true,
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			runInterleaved(ctx, t, c,
 				config{
diff --git a/pkg/cmd/roachtest/kv.go b/pkg/cmd/roachtest/kv.go
index d9b85fd10acf..afeafe72405b 100644
--- a/pkg/cmd/roachtest/kv.go
+++ b/pkg/cmd/roachtest/kv.go
@@ -47,7 +47,7 @@ func registerKV(r *testRegistry) {
 			splits := " --splits=1000"
 			duration := " --duration=" + ifLocal("10s", "10m")
 			readPercent := fmt.Sprintf(" --read-percent=%d", opts.readPercent)
-
+			histograms := "--histograms=" + perfArtifactsDir + "/stats.json"
 			var batchSize string
 			if opts.batchSize > 0 {
 				batchSize = fmt.Sprintf(" --batch=%d", opts.batchSize)
@@ -64,9 +64,9 @@ func registerKV(r *testRegistry) {
 				splits = "" // no splits
 				sequential = " --sequential"
 			}
-
-			cmd := fmt.Sprintf("./workload run kv --init --histograms=logs/stats.json"+
-				concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
+			t.ArtifactsDir()
+			cmd := fmt.Sprintf("./workload run kv --init"+
+				histograms+concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
 				" {pgurl:1-%d}", nodes)
 			c.Run(ctx, c.Node(nodes+1), cmd)
 			return nil
@@ -139,9 +139,10 @@ func registerKV(r *testRegistry) {
 		}
 
 		r.Add(testSpec{
-			Name:       strings.Join(nameParts, "/"),
-			MinVersion: minVersion,
-			Cluster:    makeClusterSpec(opts.nodes+1, cpu(opts.cpus)),
+			Name:             strings.Join(nameParts, "/"),
+			MinVersion:       minVersion,
+			Cluster:          makeClusterSpec(opts.nodes+1, cpu(opts.cpus)),
+			HasPerfArtifacts: true,
 			Run: func(ctx context.Context, t *test, c *cluster) {
 				runKV(ctx, t, c, opts)
 			},
diff --git a/pkg/cmd/roachtest/ledger.go b/pkg/cmd/roachtest/ledger.go
index 2548082300c9..a41987815bd6 100644
--- a/pkg/cmd/roachtest/ledger.go
+++ b/pkg/cmd/roachtest/ledger.go
@@ -19,8 +19,9 @@ func registerLedger(r *testRegistry) {
 	const nodes = 6
 	const azs = "us-central1-a,us-central1-b,us-central1-c"
 	r.Add(testSpec{
-		Name:    fmt.Sprintf("ledger/nodes=%d/multi-az", nodes),
-		Cluster: makeClusterSpec(nodes+1, cpu(16), geo(), zones(azs)),
+		Name:             fmt.Sprintf("ledger/nodes=%d/multi-az", nodes),
+		Cluster:          makeClusterSpec(nodes+1, cpu(16), geo(), zones(azs)),
+		HasPerfArtifacts: true,
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			roachNodes := c.Range(1, nodes)
 			gatewayNodes := c.Range(1, nodes/3)
@@ -36,7 +37,7 @@ func registerLedger(r *testRegistry) {
 				concurrency := ifLocal("", " --concurrency="+fmt.Sprint(nodes*32))
 				duration := " --duration=" + ifLocal("10s", "30m")
 
-				cmd := fmt.Sprintf("./workload run ledger --init --histograms=logs/stats.json"+
+				cmd := fmt.Sprintf("./workload run ledger --init --histograms="+perfArtifactsDir+"/stats.json"+
 					concurrency+duration+" {pgurl%s}", gatewayNodes)
 				c.Run(ctx, loadNode, cmd)
 				return nil
diff --git a/pkg/cmd/roachtest/network.go b/pkg/cmd/roachtest/network.go
index db530df121c3..28f55a70b0e5 100644
--- a/pkg/cmd/roachtest/network.go
+++ b/pkg/cmd/roachtest/network.go
@@ -136,7 +136,9 @@ func runNetworkTPCC(ctx context.Context, t *test, origC *cluster, nodes int) {
 		}
 
 		cmd := fmt.Sprintf(
-			"./workload run tpcc --warehouses=%d --wait=false --histograms=logs/stats.json --duration=%s {pgurl:2-%d}",
+			"./workload run tpcc --warehouses=%d --wait=false"+
+				" --histograms="+perfArtifactsDir+"/stats.json"+
+				" --duration=%s {pgurl:2-%d}",
 			warehouses, duration, c.spec.NodeCount-1)
 		return c.RunL(ctx, tpccL, workerNode, cmd)
 	})
@@ -244,8 +246,9 @@ func registerNetwork(r *testRegistry) {
 		},
 	})
 	r.Add(testSpec{
-		Name:    fmt.Sprintf("network/tpcc/nodes=%d", numNodes),
-		Cluster: makeClusterSpec(numNodes),
+		Name:             fmt.Sprintf("network/tpcc/nodes=%d", numNodes),
+		Cluster:          makeClusterSpec(numNodes),
+		HasPerfArtifacts: true,
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			runNetworkTPCC(ctx, t, c, numNodes)
 		},
diff --git a/pkg/cmd/roachtest/queue.go b/pkg/cmd/roachtest/queue.go
index b8e88f797bb5..e981d1c19b12 100644
--- a/pkg/cmd/roachtest/queue.go
+++ b/pkg/cmd/roachtest/queue.go
@@ -23,9 +23,10 @@ func registerQueue(r *testRegistry) {
 	// One node runs the workload generator, all other nodes host CockroachDB.
 	const numNodes = 2
 	r.Add(testSpec{
-		Skip:    "https://github.com/cockroachdb/cockroach/issues/17229",
-		Name:    fmt.Sprintf("queue/nodes=%d", numNodes-1),
-		Cluster: makeClusterSpec(numNodes),
+		Skip:             "https://github.com/cockroachdb/cockroach/issues/17229",
+		Name:             fmt.Sprintf("queue/nodes=%d", numNodes-1),
+		Cluster:          makeClusterSpec(numNodes),
+		HasPerfArtifacts: true,
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			runQueue(ctx, t, c)
 		},
@@ -52,7 +53,7 @@ func runQueue(ctx context.Context, t *test, c *cluster) {
 				init = " --init"
 			}
 			cmd := fmt.Sprintf(
-				"./workload run queue --histograms=logs/stats.json"+
+				"./workload run queue --histograms="+perfArtifactsDir+"/stats.json"+
 					init+
 					concurrency+
 					duration+
diff --git a/pkg/cmd/roachtest/test.go b/pkg/cmd/roachtest/test.go
index 01a0b3768815..b2fe2c7e7cca 100644
--- a/pkg/cmd/roachtest/test.go
+++ b/pkg/cmd/roachtest/test.go
@@ -66,10 +66,23 @@ type testSpec struct {
 	// care about.
 	UseIOBarrier bool
 
+	// HasPerfArtifacts is true is the test has perf artifacts in its
+	// perfArtifactsDir on remote nodes which should be retreived when the test
+	// completes. Because tests control which node has the artifacts we pull this
+	// directory from all of the nodes if it is non-empty.
+	// It will not cause an error if the perf artifacts directory does not exist.
+	HasPerfArtifacts bool
+
 	// Run is the test function.
 	Run func(ctx context.Context, t *test, c *cluster)
 }
 
+// perfArtifactsDir is the directory on cluster nodes in which perf artifacts
+// should be stored. If a test spec HasPerfArtifacts and passes, then the runner
+// will retreive that directory from all nodes and copy it to the test artifacts
+// directory.
+const perfArtifactsDir = "perf"
+
 // matchOrSkip returns true if the filter matches the test. If the filter does
 // not match the test because the tag filter does not match, the test is
 // matched, but marked as skipped.
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 50f47f6af44c..af42c01461a8 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -491,6 +491,19 @@ func (r *testRunner) runWorker(
 			if err != nil {
 				return err
 			}
+		} else if t.spec.HasPerfArtifacts {
+			// Fetch the perf artifacts from the remote hosts.
+			// If there's an error, oh well, don't do anything rash like fail the test
+			// which already passed.
+			//
+			// TODO(ajwerner): this Get on all nodes has the unfortunate side effect
+			// of logging an error in the test runner log for all of the nodes which
+			// do not have perf artifacts. Ideally we'd have the test tell us which
+			// nodes have the artifacts, or we'd go check explicitly, or we'd find a
+			// way for Get to not complain if a file does not exist.
+			if err := c.Get(ctx, l, perfArtifactsDir, t.artifactsDir+"/"+perfArtifactsDir); err != nil {
+				l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err)
+			}
 		}
 	}
 }
diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go
index 721ed9726842..5e12b005b868 100644
--- a/pkg/cmd/roachtest/tpcc.go
+++ b/pkg/cmd/roachtest/tpcc.go
@@ -167,7 +167,7 @@ func runTPCC(ctx context.Context, t *test, c *cluster, opts tpccOptions) {
 	m.Go(func(ctx context.Context) error {
 		t.WorkerStatus("running tpcc")
 		cmd := fmt.Sprintf(
-			"./workload run tpcc --warehouses=%d --histograms=logs/stats.json "+
+			"./workload run tpcc --warehouses=%d --histograms="+perfArtifactsDir+"/stats.json "+
 				opts.Extra+" --ramp=%s --duration=%s {pgurl:1-%d}",
 			opts.Warehouses, rampDuration, opts.Duration, c.spec.NodeCount-1)
 		c.Run(ctx, workloadNode, cmd)
@@ -238,9 +238,10 @@ func registerTPCC(r *testRegistry) {
 		Name: "tpcc/headroom/" + headroomSpec.String(),
 		// TODO(dan): Backfill tpccSupportedWarehouses and remove this "v2.1.0"
 		// minimum on gce.
-		MinVersion: maxVersion("v2.1.0", maybeMinVersionForFixturesImport(cloud)),
-		Tags:       []string{`default`, `release_qualification`},
-		Cluster:    headroomSpec,
+		MinVersion:       maxVersion("v2.1.0", maybeMinVersionForFixturesImport(cloud)),
+		Tags:             []string{`default`, `release_qualification`},
+		Cluster:          headroomSpec,
+		HasPerfArtifacts: true,
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			maxWarehouses := maxSupportedTPCCWarehouses(r.buildVersion, cloud, t.spec.Cluster)
 			headroomWarehouses := int(float64(maxWarehouses) * 0.7)
@@ -551,10 +552,11 @@ func registerTPCCBenchSpec(r *testRegistry, b tpccBenchSpec) {
 	nodes := makeClusterSpec(numNodes, opts...)
 
 	r.Add(testSpec{
-		Name:       name,
-		Cluster:    nodes,
-		MinVersion: maybeMinVersionForFixturesImport(cloud),
-		Tags:       b.Tags,
+		Name:             name,
+		Cluster:          nodes,
+		MinVersion:       maybeMinVersionForFixturesImport(cloud),
+		Tags:             b.Tags,
+		HasPerfArtifacts: true,
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			runTPCCBench(ctx, t, c, b)
 		},
@@ -781,7 +783,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 					}
 
 					t.Status(fmt.Sprintf("running benchmark, warehouses=%d", warehouses))
-					histogramsPath := fmt.Sprintf("logs/warehouses=%d/stats.json", activeWarehouses)
+					histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", perfArtifactsDir, activeWarehouses)
 					cmd := fmt.Sprintf("./workload run tpcc --warehouses=%d --active-warehouses=%d "+
 						"--tolerate-errors --ramp=%s --duration=%s%s {pgurl%s} "+
 						"--histograms=%s",
diff --git a/pkg/cmd/roachtest/tpchbench.go b/pkg/cmd/roachtest/tpchbench.go
index aa843505fecc..4e9de5f31a81 100644
--- a/pkg/cmd/roachtest/tpchbench.go
+++ b/pkg/cmd/roachtest/tpchbench.go
@@ -107,7 +107,7 @@ func runTPCHBench(ctx context.Context, t *test, c *cluster, b tpchBenchSpec) {
 		cmd := fmt.Sprintf(
 			"./workload run querybench --db=tpch --concurrency=1 --query-file=%s "+
 				"--num-runs=%d --max-ops=%d --vectorized=%t {pgurl%s} "+
-				"--histograms=logs/stats.json --histograms-max-latency=%s",
+				"--histograms="+perfArtifactsDir+"/stats.json --histograms-max-latency=%s",
 			filename,
 			b.numRunsPerQuery,
 			maxOps,
@@ -237,9 +237,10 @@ func registerTPCHBenchSpec(r *testRegistry, b tpchBenchSpec) {
 	}
 
 	r.Add(testSpec{
-		Name:       strings.Join(nameParts, "/"),
-		Cluster:    makeClusterSpec(numNodes),
-		MinVersion: minVersion,
+		Name:             strings.Join(nameParts, "/"),
+		Cluster:          makeClusterSpec(numNodes),
+		MinVersion:       minVersion,
+		HasPerfArtifacts: true,
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			runTPCHBench(ctx, t, c, b)
 		},
diff --git a/pkg/cmd/roachtest/ycsb.go b/pkg/cmd/roachtest/ycsb.go
index 1657d02816d4..cbd00c9a0d63 100644
--- a/pkg/cmd/roachtest/ycsb.go
+++ b/pkg/cmd/roachtest/ycsb.go
@@ -30,7 +30,7 @@ func registerYCSB(r *testRegistry) {
 			duration := " --duration=" + ifLocal("10s", "10m")
 			cmd := fmt.Sprintf(
 				"./workload run ycsb --init --record-count=1000000 --splits=100"+
-					" --workload=%s --concurrency=64 --histograms=logs/stats.json"+
+					" --workload=%s --concurrency=64 --histograms="+perfArtifactsDir+"/stats.json"+
 					ramp+duration+" {pgurl:1-%d}",
 				wl, nodes)
 			c.Run(ctx, c.Node(nodes+1), cmd)
@@ -49,8 +49,9 @@ func registerYCSB(r *testRegistry) {
 			}
 			wl, cpus := wl, cpus
 			r.Add(testSpec{
-				Name:    name,
-				Cluster: makeClusterSpec(4, cpu(cpus)),
+				Name:             name,
+				Cluster:          makeClusterSpec(4, cpu(cpus)),
+				HasPerfArtifacts: true,
 				Run: func(ctx context.Context, t *test, c *cluster) {
 					runYCSB(ctx, t, c, wl, cpus)
 				},