From 3cf2306bbf45efd01f3243b74c6e259d5c4080b8 Mon Sep 17 00:00:00 2001 From: Andrew Werner Date: Mon, 15 Jul 2019 19:21:10 -0400 Subject: [PATCH] roachtest: create "perf" artifacts, write histograms there, collect them Before the roachtest rewrite in #30977 we used to collect the logs directory from remote hosts unconditionally. We also used to write stats.json histogram files to the logs directory. This combination enabled the collection of histogram files for roachperf. Since the new roachtest only copies logs to the test runner in the case of failure, we have not gotten any perf artifacts since the change. This log policy is sane for logs and other debug artifacts like heap profiles that exist there but has been a bit of a bummer as we've missed out on the dopamine rush due to seeing charts go up and to the right in bespoke D3 glory. The truth is that we weren't all that excited about the previous behavior with regards to perf data either. Sometimes tests would fail and their lousy results would mess with our pretty pictures (https://github.com/cockroachdb/roachperf/issues/37). This PR introduces a new concept, perf artifacts, which live in a different directory, the perfArtifactsDir ("perf") and has the opposite fetching policy from the logs; the "perf" directory is copied from each remove host into the test's artifactsDir only upon successful completion of the test. One downside of this as implemented is that it can spam the test's log (but fortunately not roachtest's logs anymore, <3 @andreimatei). #38890 attempts to address this somewhat. It shouldn't been too big of a problem because it only happens on tests that already passed. This change also necessitates a small change to roachperf to look in this new "perf" directory (and respect the rewrite's test artifact directory structure). Fixes #38871. Release note: None --- pkg/cmd/roachtest/indexes.go | 2 +- pkg/cmd/roachtest/interleavedpartitioned.go | 2 +- pkg/cmd/roachtest/kv.go | 7 +++---- pkg/cmd/roachtest/ledger.go | 2 +- pkg/cmd/roachtest/network.go | 4 +++- pkg/cmd/roachtest/queue.go | 2 +- pkg/cmd/roachtest/test.go | 5 +++++ pkg/cmd/roachtest/test_runner.go | 13 +++++++++++++ pkg/cmd/roachtest/tpcc.go | 4 ++-- pkg/cmd/roachtest/tpchbench.go | 2 +- pkg/cmd/roachtest/ycsb.go | 2 +- 11 files changed, 32 insertions(+), 13 deletions(-) diff --git a/pkg/cmd/roachtest/indexes.go b/pkg/cmd/roachtest/indexes.go index 1ff852018027..1f4a2cc9d225 100644 --- a/pkg/cmd/roachtest/indexes.go +++ b/pkg/cmd/roachtest/indexes.go @@ -56,7 +56,7 @@ func registerNIndexes(r *testRegistry, secondaryIndexes int) { payload := " --payload=256" concurrency := ifLocal("", " --concurrency="+strconv.Itoa(nodes*32)) duration := " --duration=" + ifLocal("10s", "30m") - runCmd := fmt.Sprintf("./workload run indexes --histograms=logs/stats.json"+ + runCmd := fmt.Sprintf("./workload run indexes --histograms="+perfArtifactsDir+"/stats.json"+ payload+concurrency+duration+" {pgurl%s}", gatewayNodes) c.Run(ctx, loadNode, runCmd) return nil diff --git a/pkg/cmd/roachtest/interleavedpartitioned.go b/pkg/cmd/roachtest/interleavedpartitioned.go index d726ce7a5a58..a6f1324db628 100644 --- a/pkg/cmd/roachtest/interleavedpartitioned.go +++ b/pkg/cmd/roachtest/interleavedpartitioned.go @@ -67,7 +67,7 @@ func registerInterleaved(r *testRegistry) { c.Run(ctx, cockroachEast.randNode(), cmdInit) duration := " --duration " + ifLocal("10s", "10m") - histograms := " --histograms logs/stats.json" + histograms := " --histograms=" + perfArtifactsDir + "/stats.json" createCmd := func(locality string, cockroachNodes nodeListOption) string { return fmt.Sprintf( diff --git a/pkg/cmd/roachtest/kv.go b/pkg/cmd/roachtest/kv.go index d9b85fd10acf..7416f00b5858 100644 --- a/pkg/cmd/roachtest/kv.go +++ b/pkg/cmd/roachtest/kv.go @@ -47,7 +47,7 @@ func registerKV(r *testRegistry) { splits := " --splits=1000" duration := " --duration=" + ifLocal("10s", "10m") readPercent := fmt.Sprintf(" --read-percent=%d", opts.readPercent) - + histograms := "--histograms=" + perfArtifactsDir + "/stats.json" var batchSize string if opts.batchSize > 0 { batchSize = fmt.Sprintf(" --batch=%d", opts.batchSize) @@ -64,9 +64,8 @@ func registerKV(r *testRegistry) { splits = "" // no splits sequential = " --sequential" } - - cmd := fmt.Sprintf("./workload run kv --init --histograms=logs/stats.json"+ - concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+ + cmd := fmt.Sprintf("./workload run kv --init"+ + histograms+concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+ " {pgurl:1-%d}", nodes) c.Run(ctx, c.Node(nodes+1), cmd) return nil diff --git a/pkg/cmd/roachtest/ledger.go b/pkg/cmd/roachtest/ledger.go index 2548082300c9..cfa59bec9a29 100644 --- a/pkg/cmd/roachtest/ledger.go +++ b/pkg/cmd/roachtest/ledger.go @@ -36,7 +36,7 @@ func registerLedger(r *testRegistry) { concurrency := ifLocal("", " --concurrency="+fmt.Sprint(nodes*32)) duration := " --duration=" + ifLocal("10s", "30m") - cmd := fmt.Sprintf("./workload run ledger --init --histograms=logs/stats.json"+ + cmd := fmt.Sprintf("./workload run ledger --init --histograms="+perfArtifactsDir+"/stats.json"+ concurrency+duration+" {pgurl%s}", gatewayNodes) c.Run(ctx, loadNode, cmd) return nil diff --git a/pkg/cmd/roachtest/network.go b/pkg/cmd/roachtest/network.go index db530df121c3..39aa5032e9ed 100644 --- a/pkg/cmd/roachtest/network.go +++ b/pkg/cmd/roachtest/network.go @@ -136,7 +136,9 @@ func runNetworkTPCC(ctx context.Context, t *test, origC *cluster, nodes int) { } cmd := fmt.Sprintf( - "./workload run tpcc --warehouses=%d --wait=false --histograms=logs/stats.json --duration=%s {pgurl:2-%d}", + "./workload run tpcc --warehouses=%d --wait=false"+ + " --histograms="+perfArtifactsDir+"/stats.json"+ + " --duration=%s {pgurl:2-%d}", warehouses, duration, c.spec.NodeCount-1) return c.RunL(ctx, tpccL, workerNode, cmd) }) diff --git a/pkg/cmd/roachtest/queue.go b/pkg/cmd/roachtest/queue.go index b8e88f797bb5..f3647a8f6620 100644 --- a/pkg/cmd/roachtest/queue.go +++ b/pkg/cmd/roachtest/queue.go @@ -52,7 +52,7 @@ func runQueue(ctx context.Context, t *test, c *cluster) { init = " --init" } cmd := fmt.Sprintf( - "./workload run queue --histograms=logs/stats.json"+ + "./workload run queue --histograms="+perfArtifactsDir+"/stats.json"+ init+ concurrency+ duration+ diff --git a/pkg/cmd/roachtest/test.go b/pkg/cmd/roachtest/test.go index 01a0b3768815..f7ed3779c81f 100644 --- a/pkg/cmd/roachtest/test.go +++ b/pkg/cmd/roachtest/test.go @@ -70,6 +70,11 @@ type testSpec struct { Run func(ctx context.Context, t *test, c *cluster) } +// perfArtifactsDir is the directory on cluster nodes in which perf artifacts +// reside. Upon success this directory is copied into test artifactsDir from +// each node in the cluster. +const perfArtifactsDir = "perf" + // matchOrSkip returns true if the filter matches the test. If the filter does // not match the test because the tag filter does not match, the test is // matched, but marked as skipped. diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 50f47f6af44c..0e4407cc3e09 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -491,6 +491,19 @@ func (r *testRunner) runWorker( if err != nil { return err } + } else { + // Upon success fetch the perf artifacts from the remote hosts. + // If there's an error, oh well, don't do anything rash like fail the test + // which already passed. + // + // TODO(ajwerner): this Get on all nodes has the unfortunate side effect + // of logging an error in the test runner log for all of the nodes which + // do not have perf artifacts. Ideally we'd have the test tell us which + // nodes have the artifacts, or we'd go check explicitly, or we'd find a + // way for Get to not complain if a file does not exist. + if err := c.Get(ctx, l, perfArtifactsDir, t.artifactsDir+"/"+perfArtifactsDir); err != nil { + l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err) + } } } } diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go index 721ed9726842..30eb0ea96078 100644 --- a/pkg/cmd/roachtest/tpcc.go +++ b/pkg/cmd/roachtest/tpcc.go @@ -167,7 +167,7 @@ func runTPCC(ctx context.Context, t *test, c *cluster, opts tpccOptions) { m.Go(func(ctx context.Context) error { t.WorkerStatus("running tpcc") cmd := fmt.Sprintf( - "./workload run tpcc --warehouses=%d --histograms=logs/stats.json "+ + "./workload run tpcc --warehouses=%d --histograms="+perfArtifactsDir+"/stats.json "+ opts.Extra+" --ramp=%s --duration=%s {pgurl:1-%d}", opts.Warehouses, rampDuration, opts.Duration, c.spec.NodeCount-1) c.Run(ctx, workloadNode, cmd) @@ -781,7 +781,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) { } t.Status(fmt.Sprintf("running benchmark, warehouses=%d", warehouses)) - histogramsPath := fmt.Sprintf("logs/warehouses=%d/stats.json", activeWarehouses) + histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", perfArtifactsDir, activeWarehouses) cmd := fmt.Sprintf("./workload run tpcc --warehouses=%d --active-warehouses=%d "+ "--tolerate-errors --ramp=%s --duration=%s%s {pgurl%s} "+ "--histograms=%s", diff --git a/pkg/cmd/roachtest/tpchbench.go b/pkg/cmd/roachtest/tpchbench.go index aa843505fecc..b4a9d04271ad 100644 --- a/pkg/cmd/roachtest/tpchbench.go +++ b/pkg/cmd/roachtest/tpchbench.go @@ -107,7 +107,7 @@ func runTPCHBench(ctx context.Context, t *test, c *cluster, b tpchBenchSpec) { cmd := fmt.Sprintf( "./workload run querybench --db=tpch --concurrency=1 --query-file=%s "+ "--num-runs=%d --max-ops=%d --vectorized=%t {pgurl%s} "+ - "--histograms=logs/stats.json --histograms-max-latency=%s", + "--histograms="+perfArtifactsDir+"/stats.json --histograms-max-latency=%s", filename, b.numRunsPerQuery, maxOps, diff --git a/pkg/cmd/roachtest/ycsb.go b/pkg/cmd/roachtest/ycsb.go index 1657d02816d4..78c4d4a85b9d 100644 --- a/pkg/cmd/roachtest/ycsb.go +++ b/pkg/cmd/roachtest/ycsb.go @@ -30,7 +30,7 @@ func registerYCSB(r *testRegistry) { duration := " --duration=" + ifLocal("10s", "10m") cmd := fmt.Sprintf( "./workload run ycsb --init --record-count=1000000 --splits=100"+ - " --workload=%s --concurrency=64 --histograms=logs/stats.json"+ + " --workload=%s --concurrency=64 --histograms="+perfArtifactsDir+"/stats.json"+ ramp+duration+" {pgurl:1-%d}", wl, nodes) c.Run(ctx, c.Node(nodes+1), cmd)