roachtest: create perf artifacts concept and collect them
Before the roachtest rewrite in #30977, we used to collect the logs directory
from remote hosts unconditionally. We also used to write stats.json histogram
files to the logs directory. This combination enabled the collection of
histogram files for roachperf. Because the new roachtest does not
unconditionally copy logs to the test runner, we have not collected any perf
artifacts since that change.

This PR introduces a new concept, perf artifacts, which live in a separate
directory, the perfArtifactsDir. If a testSpec indicates that it
HasPerfArtifacts, then upon success that directory is copied from each node in
the cluster to the test runner's artifactsDir for that test.

This change also necessitates a small change to roachperf to look in this new
"perf" directory.

Fixes #38871.

Release note: None
ajwerner committed Jul 15, 2019
1 parent 02d52a6 commit 0492bb9
Showing 11 changed files with 76 additions and 38 deletions.
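
Every test file below follows the same two-step pattern: opt in by setting
HasPerfArtifacts on the testSpec, and point the workload's --histograms output
at perfArtifactsDir instead of logs/. A minimal sketch of that shape, assuming
an invented test name and workload flags (testSpec, makeClusterSpec,
perfArtifactsDir, and the cluster helpers are the ones defined in the diffs
below):

  r.Add(testSpec{
    Name:             "example/perf",      // invented name, not a test in this commit
    Cluster:          makeClusterSpec(4),  // e.g. 3 CockroachDB nodes + 1 load generator
    HasPerfArtifacts: true,                // on success, runner fetches perfArtifactsDir from every node
    Run: func(ctx context.Context, t *test, c *cluster) {
      // Write histograms under perfArtifactsDir ("perf") rather than logs/
      // so the runner knows where to find them when the test passes.
      cmd := fmt.Sprintf("./workload run kv --init"+
        " --histograms="+perfArtifactsDir+"/stats.json"+
        " --duration=10m {pgurl:1-%d}", c.spec.NodeCount-1)
      c.Run(ctx, c.Node(c.spec.NodeCount), cmd)
    },
  })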
5 changes: 3 additions & 2 deletions pkg/cmd/roachtest/indexes.go
@@ -25,7 +25,8 @@ func registerNIndexes(r *testRegistry, secondaryIndexes int) {
     Name:    fmt.Sprintf("indexes/%d/nodes=%d/multi-region", secondaryIndexes, nodes),
     Cluster: makeClusterSpec(nodes+1, cpu(16), geo(), zones(geoZonesStr)),
     // Uses CONFIGURE ZONE USING ... COPY FROM PARENT syntax.
-    MinVersion: `v19.1.0`,
+    MinVersion:       `v19.1.0`,
+    HasPerfArtifacts: true,
     Run: func(ctx context.Context, t *test, c *cluster) {
       firstAZ := geoZones[0]
       roachNodes := c.Range(1, nodes)
@@ -56,7 +57,7 @@ func registerNIndexes(r *testRegistry, secondaryIndexes int) {
       payload := " --payload=256"
       concurrency := ifLocal("", " --concurrency="+strconv.Itoa(nodes*32))
       duration := " --duration=" + ifLocal("10s", "30m")
-      runCmd := fmt.Sprintf("./workload run indexes --histograms=logs/stats.json"+
+      runCmd := fmt.Sprintf("./workload run indexes --histograms="+perfArtifactsDir+"/stats.json"+
        payload+concurrency+duration+" {pgurl%s}", gatewayNodes)
       c.Run(ctx, loadNode, runCmd)
       return nil
7 changes: 4 additions & 3 deletions pkg/cmd/roachtest/interleavedpartitioned.go
@@ -67,7 +67,7 @@ func registerInterleaved(r *testRegistry) {
   c.Run(ctx, cockroachEast.randNode(), cmdInit)

   duration := " --duration " + ifLocal("10s", "10m")
-  histograms := " --histograms logs/stats.json"
+  histograms := " --histograms=" + perfArtifactsDir + "/stats.json"

   createCmd := func(locality string, cockroachNodes nodeListOption) string {
     return fmt.Sprintf(
@@ -123,8 +123,9 @@
 }

 r.Add(testSpec{
-  Name:    "interleavedpartitioned",
-  Cluster: makeClusterSpec(12, geo(), zones("us-west1-b,us-east4-b,us-central1-a")),
+  Name:             "interleavedpartitioned",
+  Cluster:          makeClusterSpec(12, geo(), zones("us-west1-b,us-east4-b,us-central1-a")),
+  HasPerfArtifacts: true,
   Run: func(ctx context.Context, t *test, c *cluster) {
     runInterleaved(ctx, t, c,
       config{
15 changes: 8 additions & 7 deletions pkg/cmd/roachtest/kv.go
@@ -47,7 +47,7 @@ func registerKV(r *testRegistry) {
     splits := " --splits=1000"
     duration := " --duration=" + ifLocal("10s", "10m")
     readPercent := fmt.Sprintf(" --read-percent=%d", opts.readPercent)
-
+    histograms := " --histograms=" + perfArtifactsDir + "/stats.json"
     var batchSize string
     if opts.batchSize > 0 {
       batchSize = fmt.Sprintf(" --batch=%d", opts.batchSize)
@@ -64,9 +64,9 @@
       splits = "" // no splits
       sequential = " --sequential"
     }
-
-    cmd := fmt.Sprintf("./workload run kv --init --histograms=logs/stats.json"+
-      concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
+    t.ArtifactsDir()
+    cmd := fmt.Sprintf("./workload run kv --init"+
+      histograms+concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
       " {pgurl:1-%d}", nodes)
     c.Run(ctx, c.Node(nodes+1), cmd)
     return nil
@@ -139,9 +139,10 @@ func registerKV(r *testRegistry) {
 }

 r.Add(testSpec{
-  Name:       strings.Join(nameParts, "/"),
-  MinVersion: minVersion,
-  Cluster:    makeClusterSpec(opts.nodes+1, cpu(opts.cpus)),
+  Name:             strings.Join(nameParts, "/"),
+  MinVersion:       minVersion,
+  Cluster:          makeClusterSpec(opts.nodes+1, cpu(opts.cpus)),
+  HasPerfArtifacts: true,
   Run: func(ctx context.Context, t *test, c *cluster) {
     runKV(ctx, t, c, opts)
   },
7 changes: 4 additions & 3 deletions pkg/cmd/roachtest/ledger.go
@@ -19,8 +19,9 @@ func registerLedger(r *testRegistry) {
   const nodes = 6
   const azs = "us-central1-a,us-central1-b,us-central1-c"
   r.Add(testSpec{
-    Name:    fmt.Sprintf("ledger/nodes=%d/multi-az", nodes),
-    Cluster: makeClusterSpec(nodes+1, cpu(16), geo(), zones(azs)),
+    Name:             fmt.Sprintf("ledger/nodes=%d/multi-az", nodes),
+    Cluster:          makeClusterSpec(nodes+1, cpu(16), geo(), zones(azs)),
+    HasPerfArtifacts: true,
     Run: func(ctx context.Context, t *test, c *cluster) {
       roachNodes := c.Range(1, nodes)
       gatewayNodes := c.Range(1, nodes/3)
@@ -36,7 +37,7 @@
       concurrency := ifLocal("", " --concurrency="+fmt.Sprint(nodes*32))
       duration := " --duration=" + ifLocal("10s", "30m")

-      cmd := fmt.Sprintf("./workload run ledger --init --histograms=logs/stats.json"+
+      cmd := fmt.Sprintf("./workload run ledger --init --histograms="+perfArtifactsDir+"/stats.json"+
        concurrency+duration+" {pgurl%s}", gatewayNodes)
       c.Run(ctx, loadNode, cmd)
       return nil
9 changes: 6 additions & 3 deletions pkg/cmd/roachtest/network.go
@@ -136,7 +136,9 @@ func runNetworkTPCC(ctx context.Context, t *test, origC *cluster, nodes int) {
   }

   cmd := fmt.Sprintf(
-    "./workload run tpcc --warehouses=%d --wait=false --histograms=logs/stats.json --duration=%s {pgurl:2-%d}",
+    "./workload run tpcc --warehouses=%d --wait=false"+
+      " --histograms="+perfArtifactsDir+"/stats.json"+
+      " --duration=%s {pgurl:2-%d}",
     warehouses, duration, c.spec.NodeCount-1)
   return c.RunL(ctx, tpccL, workerNode, cmd)
 })
@@ -244,8 +246,9 @@ func registerNetwork(r *testRegistry) {
   },
 })
 r.Add(testSpec{
-  Name:    fmt.Sprintf("network/tpcc/nodes=%d", numNodes),
-  Cluster: makeClusterSpec(numNodes),
+  Name:             fmt.Sprintf("network/tpcc/nodes=%d", numNodes),
+  Cluster:          makeClusterSpec(numNodes),
+  HasPerfArtifacts: true,
   Run: func(ctx context.Context, t *test, c *cluster) {
     runNetworkTPCC(ctx, t, c, numNodes)
   },
9 changes: 5 additions & 4 deletions pkg/cmd/roachtest/queue.go
@@ -23,9 +23,10 @@ func registerQueue(r *testRegistry) {
   // One node runs the workload generator, all other nodes host CockroachDB.
   const numNodes = 2
   r.Add(testSpec{
-    Skip:    "https://github.com/cockroachdb/cockroach/issues/17229",
-    Name:    fmt.Sprintf("queue/nodes=%d", numNodes-1),
-    Cluster: makeClusterSpec(numNodes),
+    Skip:             "https://github.com/cockroachdb/cockroach/issues/17229",
+    Name:             fmt.Sprintf("queue/nodes=%d", numNodes-1),
+    Cluster:          makeClusterSpec(numNodes),
+    HasPerfArtifacts: true,
     Run: func(ctx context.Context, t *test, c *cluster) {
       runQueue(ctx, t, c)
     },
@@ -52,7 +53,7 @@ func runQueue(ctx context.Context, t *test, c *cluster) {
     init = " --init"
   }
   cmd := fmt.Sprintf(
-    "./workload run queue --histograms=logs/stats.json"+
+    "./workload run queue --histograms="+perfArtifactsDir+"/stats.json"+
      init+
      concurrency+
      duration+
13 changes: 13 additions & 0 deletions pkg/cmd/roachtest/test.go
@@ -66,10 +66,23 @@ type testSpec struct {
   // care about.
   UseIOBarrier bool

+  // HasPerfArtifacts is true if the test has perf artifacts in its
+  // perfArtifactsDir on remote nodes which should be retrieved when the test
+  // completes. Because tests control which node has the artifacts, we pull this
+  // directory from all of the nodes if it is non-empty.
+  // It will not cause an error if the perf artifacts directory does not exist.
+  HasPerfArtifacts bool
+
   // Run is the test function.
   Run func(ctx context.Context, t *test, c *cluster)
 }

+// perfArtifactsDir is the directory on cluster nodes in which perf artifacts
+// should be stored. If a test spec HasPerfArtifacts and passes, then the runner
+// will retrieve that directory from all nodes and copy it to the test artifacts
+// directory.
+const perfArtifactsDir = "perf"
+
 // matchOrSkip returns true if the filter matches the test. If the filter does
 // not match the test because the tag filter does not match, the test is
 // matched, but marked as skipped.
13 changes: 13 additions & 0 deletions pkg/cmd/roachtest/test_runner.go
@@ -491,6 +491,19 @@ func (r *testRunner) runWorker(
       if err != nil {
         return err
       }
+    } else if t.spec.HasPerfArtifacts {
+      // Fetch the perf artifacts from the remote hosts.
+      // If there's an error, oh well, don't do anything rash like fail the test
+      // which already passed.
+      //
+      // TODO(ajwerner): this Get on all nodes has the unfortunate side effect
+      // of logging an error in the test runner log for all of the nodes which
+      // do not have perf artifacts. Ideally we'd have the test tell us which
+      // nodes have the artifacts, or we'd go check explicitly, or we'd find a
+      // way for Get to not complain if a file does not exist.
+      if err := c.Get(ctx, l, perfArtifactsDir, t.artifactsDir+"/"+perfArtifactsDir); err != nil {
+        l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err)
+      }
     }
   }
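
One possible shape for the TODO's last suggestion, tolerating missing
directories instead of logging an error for every node without artifacts. This
is a sketch only: it assumes Get surfaces a not-exist failure in a form that
os.IsNotExist can recognize, which this commit does not guarantee.

  if err := c.Get(ctx, l, perfArtifactsDir, t.artifactsDir+"/"+perfArtifactsDir); err != nil && !os.IsNotExist(err) {
    l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err)
  }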
20 changes: 11 additions & 9 deletions pkg/cmd/roachtest/tpcc.go
@@ -167,7 +167,7 @@ func runTPCC(ctx context.Context, t *test, c *cluster, opts tpccOptions) {
   m.Go(func(ctx context.Context) error {
     t.WorkerStatus("running tpcc")
     cmd := fmt.Sprintf(
-      "./workload run tpcc --warehouses=%d --histograms=logs/stats.json "+
+      "./workload run tpcc --warehouses=%d --histograms="+perfArtifactsDir+"/stats.json "+
        opts.Extra+" --ramp=%s --duration=%s {pgurl:1-%d}",
       opts.Warehouses, rampDuration, opts.Duration, c.spec.NodeCount-1)
     c.Run(ctx, workloadNode, cmd)
@@ -238,9 +238,10 @@ func registerTPCC(r *testRegistry) {
     Name: "tpcc/headroom/" + headroomSpec.String(),
     // TODO(dan): Backfill tpccSupportedWarehouses and remove this "v2.1.0"
     // minimum on gce.
-    MinVersion: maxVersion("v2.1.0", maybeMinVersionForFixturesImport(cloud)),
-    Tags:       []string{`default`, `release_qualification`},
-    Cluster:    headroomSpec,
+    MinVersion:       maxVersion("v2.1.0", maybeMinVersionForFixturesImport(cloud)),
+    Tags:             []string{`default`, `release_qualification`},
+    Cluster:          headroomSpec,
+    HasPerfArtifacts: true,
     Run: func(ctx context.Context, t *test, c *cluster) {
       maxWarehouses := maxSupportedTPCCWarehouses(r.buildVersion, cloud, t.spec.Cluster)
       headroomWarehouses := int(float64(maxWarehouses) * 0.7)
@@ -551,10 +552,11 @@ func registerTPCCBenchSpec(r *testRegistry, b tpccBenchSpec) {
   nodes := makeClusterSpec(numNodes, opts...)

   r.Add(testSpec{
-    Name:       name,
-    Cluster:    nodes,
-    MinVersion: maybeMinVersionForFixturesImport(cloud),
-    Tags:       b.Tags,
+    Name:             name,
+    Cluster:          nodes,
+    MinVersion:       maybeMinVersionForFixturesImport(cloud),
+    Tags:             b.Tags,
+    HasPerfArtifacts: true,
     Run: func(ctx context.Context, t *test, c *cluster) {
       runTPCCBench(ctx, t, c, b)
     },
@@ -781,7 +783,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
   }

   t.Status(fmt.Sprintf("running benchmark, warehouses=%d", warehouses))
-  histogramsPath := fmt.Sprintf("logs/warehouses=%d/stats.json", activeWarehouses)
+  histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", perfArtifactsDir, activeWarehouses)
   cmd := fmt.Sprintf("./workload run tpcc --warehouses=%d --active-warehouses=%d "+
     "--tolerate-errors --ramp=%s --duration=%s%s {pgurl%s} "+
     "--histograms=%s",
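
One detail worth noting in runTPCCBench above: each warehouse step writes its
own stats.json under a warehouses=%d subdirectory, so a passing run leaves a
small tree of histogram files inside the perf directory, roughly like this
(warehouse counts illustrative):

  perf/warehouses=1000/stats.json
  perf/warehouses=1200/stats.json
  perf/warehouses=1400/stats.json

Since the runner copies the whole directory, the nested files come along for
free.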
9 changes: 5 additions & 4 deletions pkg/cmd/roachtest/tpchbench.go
@@ -107,7 +107,7 @@ func runTPCHBench(ctx context.Context, t *test, c *cluster, b tpchBenchSpec) {
   cmd := fmt.Sprintf(
     "./workload run querybench --db=tpch --concurrency=1 --query-file=%s "+
      "--num-runs=%d --max-ops=%d --vectorized=%t {pgurl%s} "+
-     "--histograms=logs/stats.json --histograms-max-latency=%s",
+     "--histograms="+perfArtifactsDir+"/stats.json --histograms-max-latency=%s",
     filename,
     b.numRunsPerQuery,
     maxOps,
@@ -237,9 +237,10 @@ func registerTPCHBenchSpec(r *testRegistry, b tpchBenchSpec) {
   }

   r.Add(testSpec{
-    Name:       strings.Join(nameParts, "/"),
-    Cluster:    makeClusterSpec(numNodes),
-    MinVersion: minVersion,
+    Name:             strings.Join(nameParts, "/"),
+    Cluster:          makeClusterSpec(numNodes),
+    MinVersion:       minVersion,
+    HasPerfArtifacts: true,
     Run: func(ctx context.Context, t *test, c *cluster) {
       runTPCHBench(ctx, t, c, b)
     },
7 changes: 4 additions & 3 deletions pkg/cmd/roachtest/ycsb.go
@@ -30,7 +30,7 @@ func registerYCSB(r *testRegistry) {
     duration := " --duration=" + ifLocal("10s", "10m")
     cmd := fmt.Sprintf(
       "./workload run ycsb --init --record-count=1000000 --splits=100"+
-       " --workload=%s --concurrency=64 --histograms=logs/stats.json"+
+       " --workload=%s --concurrency=64 --histograms="+perfArtifactsDir+"/stats.json"+
        ramp+duration+" {pgurl:1-%d}",
       wl, nodes)
     c.Run(ctx, c.Node(nodes+1), cmd)
@@ -49,8 +49,9 @@ func registerYCSB(r *testRegistry) {
   }
   wl, cpus := wl, cpus
   r.Add(testSpec{
-    Name:    name,
-    Cluster: makeClusterSpec(4, cpu(cpus)),
+    Name:             name,
+    Cluster:          makeClusterSpec(4, cpu(cpus)),
+    HasPerfArtifacts: true,
     Run: func(ctx context.Context, t *test, c *cluster) {
       runYCSB(ctx, t, c, wl, cpus)
     },
