From 45b20bc95b01bf27be07f1769d1110b072d44b04 Mon Sep 17 00:00:00 2001 From: Miral Gadani Date: Wed, 2 Aug 2023 19:43:37 +0000 Subject: [PATCH] roachtest: set `test_run_id` label which will be scraped for metrics The `test_run_id` will be unique per invocation of `roachtest`. It's purpose is to simplify finding metrics for a given run. TeamCity invoked roachtests, such as GCE Roachtest Nightly, will have a `test_run_id` in the form `-` to make it easy to find metrics for a particular build. Roachtests run by individual users will have a `test_run_id` in the form `-`. Epic: none Release note: None --- pkg/cmd/roachtest/cluster.go | 8 ++++ pkg/cmd/roachtest/main.go | 72 +++++++++++++------------------- pkg/cmd/roachtest/test_runner.go | 24 ++++++++++- pkg/roachprod/vm/vm.go | 1 + 4 files changed, 61 insertions(+), 44 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 297c453de58f..c3f8b22130ed 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -615,6 +615,11 @@ func (r *clusterRegistry) registerCluster(c *clusterImpl) error { return fmt.Errorf("cluster named %q already exists in registry", c.name) } r.mu.clusters[c.name] = c + if err := c.addLabels(map[string]string{ + VmLabelTestRunID: runID, + }); err != nil && c.l != nil { + c.l.Printf("failed to add %s label to cluster: %s", VmLabelTestRunID, err) + } return nil } @@ -626,6 +631,9 @@ func (r *clusterRegistry) unregisterCluster(c *clusterImpl) bool { // method to be called defensively. return false } + if err := c.removeLabels([]string{VmLabelTestRunID}); err != nil && c.l != nil { + c.l.Printf("failed to remove %s label from cluster: %s", VmLabelTestRunID, err) + } delete(r.mu.clusters, c.name) if c.tag != "" { if _, ok := r.mu.tagCount[c.tag]; !ok { diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go index bebf18e13824..daeb652aeef4 100644 --- a/pkg/cmd/roachtest/main.go +++ b/pkg/cmd/roachtest/main.go @@ -279,6 +279,29 @@ Examples: listCmd.Flags().StringVar( &cloud, "cloud", cloud, "cloud provider to use (aws, azure, or gce)") + runFn := func(args []string, benchOnly bool) error { + if literalArtifacts == "" { + literalArtifacts = artifacts + } + return runTests(tests.RegisterTests, cliCfg{ + args: args, + count: count, + cpuQuota: cpuQuota, + runSkipped: runSkipped, + debugMode: debugModeFromOpts(), + skipInit: skipInit, + httpPort: httpPort, + promPort: promPort, + parallelism: parallelism, + artifactsDir: artifacts, + literalArtifactsDir: literalArtifacts, + user: getUser(username), + clusterID: clusterID, + versionsBinaryOverride: versionsBinaryOverride, + selectProbability: selectProbability, + }, benchOnly) + } + var runCmd = &cobra.Command{ // Don't display usage when tests fail. SilenceUsage: true, @@ -298,26 +321,7 @@ COCKROACH_ environment variables in the local environment are passed through to the cluster nodes on start. `, RunE: func(_ *cobra.Command, args []string) error { - if literalArtifacts == "" { - literalArtifacts = artifacts - } - return runTests(tests.RegisterTests, cliCfg{ - args: args, - count: count, - cpuQuota: cpuQuota, - runSkipped: runSkipped, - debugMode: debugModeFromOpts(), - skipInit: skipInit, - httpPort: httpPort, - promPort: promPort, - parallelism: parallelism, - artifactsDir: artifacts, - literalArtifactsDir: literalArtifacts, - user: username, - clusterID: clusterID, - versionsBinaryOverride: versionsBinaryOverride, - selectProbability: selectProbability, - }, false /* benchOnly */) + return runFn(args, false /* benchOnly */) }, } @@ -341,24 +345,7 @@ the cluster nodes on start. Short: "run automated benchmarks on cockroach cluster", Long: `Run automated benchmarks on existing or ephemeral cockroach clusters.`, RunE: func(_ *cobra.Command, args []string) error { - if literalArtifacts == "" { - literalArtifacts = artifacts - } - return runTests(tests.RegisterTests, cliCfg{ - args: args, - count: count, - cpuQuota: cpuQuota, - runSkipped: runSkipped, - debugMode: debugModeFromOpts(), - skipInit: skipInit, - httpPort: httpPort, - parallelism: parallelism, - artifactsDir: artifacts, - user: username, - clusterID: clusterID, - versionsBinaryOverride: versionsBinaryOverride, - selectProbability: selectProbability, - }, true /* benchOnly */) + return runFn(args, true /* benchOnly */) }, } @@ -498,10 +485,11 @@ func runTests(register func(registry.Registry), cfg cliCfg, benchOnly bool) erro opt := clustersOpt{ typ: clusterType, clusterName: clusterName, - user: getUser(cfg.user), - cpuQuota: cfg.cpuQuota, - debugMode: cfg.debugMode, - clusterID: cfg.clusterID, + // Precedence for resolving the user: cli arg, env.ROACHPROD_USER, current user. + user: cfg.user, + cpuQuota: cfg.cpuQuota, + debugMode: cfg.debugMode, + clusterID: cfg.clusterID, } if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil { return err diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 32d2d2d46dd2..05f27f15997a 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -70,8 +70,16 @@ var ( prometheusScrapeInterval = time.Second * 15 prng, _ = randutil.NewLockedPseudoRand() + + runID string ) +// VmLabelTestName is the label used to identify the test name in the VM metadata +const VmLabelTestName string = "test_name" + +// VmLabelTestRunID is the label used to identify the test run id in the VM metadata +const VmLabelTestRunID string = "test_run_id" + // testRunner runs tests. type testRunner struct { stopper *stop.Stopper @@ -296,6 +304,8 @@ func (r *testRunner) Run( qp := quotapool.NewIntPool("cloud cpu", uint64(clustersOpt.cpuQuota)) l := lopt.l + runID = generateRunID(clustersOpt.user) + shout(ctx, l, lopt.stdout, "%s: %s", VmLabelTestRunID, runID) var wg sync.WaitGroup for i := 0; i < parallelism; i++ { @@ -386,6 +396,16 @@ func numConcurrentClusterCreations() int { return res } +// This will be added as a label to all cluster nodes when the +// cluster is registered. +func generateRunID(user string) string { + uniqueId := os.Getenv("TC_BUILD_ID") + if uniqueId == "" { + uniqueId = fmt.Sprintf("%d", timeutil.Now().Unix()) + } + return fmt.Sprintf("%s-%s", user, uniqueId) +} + // defaultClusterAllocator is used by workers to create new clusters (or to attach // to an existing one). // @@ -912,10 +932,10 @@ func (r *testRunner) runTest( s := t.Spec().(*registry.TestSpec) _ = c.addLabels(map[string]string{ - "test_name": s.Name, + VmLabelTestName: s.Name, }) defer func() { - _ = c.removeLabels([]string{"test_name"}) + _ = c.removeLabels([]string{VmLabelTestName}) t.end = timeutil.Now() // We only have to record panics if the panic'd value is not the sentinel diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index cb2b93d4a273..be04d5ebed2e 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -647,6 +647,7 @@ func DNSSafeAccount(account string) string { return strings.Map(safe, account) } +// SanitizeLabel returns a version of the string that can be used as a label. func SanitizeLabel(label string) string { // Replace any non-alphanumeric characters with hyphens re := regexp.MustCompile("[^a-zA-Z0-9]+")