From 8859c7685a438a6a508f5adc4fab070fa08952f9 Mon Sep 17 00:00:00 2001
From: Renato Costa
Date: Fri, 18 Nov 2022 12:06:33 -0500
Subject: [PATCH] roachtest: wait for upgrade to complete using retry loop in
 tpcc

Previously, the `tpcc/mixed-headroom` roachtests would reset the
`preserve_downgrade_option` setting and then wait for the upgrade to
finish by running a `SET CLUSTER SETTING version = '...'` statement.
However, that is not reliable: the statement can return an error if the
reset of `preserve_downgrade_option` has not yet propagated (see #87201).

To avoid this type of flake (which has been observed in manual runs),
we use a retry loop that waits for the cluster version to converge, as
is done by the majority of upgrade-related roachtests.

Epic: None.

Release note: None
---
 pkg/cmd/roachtest/tests/tpcc.go           |  2 +-
 pkg/cmd/roachtest/tests/versionupgrade.go | 13 -------------
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go
index fd5808c175b5..fe4a73a35c15 100644
--- a/pkg/cmd/roachtest/tests/tpcc.go
+++ b/pkg/cmd/roachtest/tests/tpcc.go
@@ -471,7 +471,7 @@ func runTPCCMixedHeadroom(
 		// to get a better idea of what errors come back here, if any.
 		// This will block until the long-running migrations have run.
 		allowAutoUpgradeStep(randomCRDBNode()),
-		setClusterSettingVersionStep,
+		waitForUpgradeStep(crdbNodes),
 		// Wait until TPCC background run terminates
 		// and fail if it reports an error.
 		tpccWorkload.wait,

diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go
index 1e6d52a42ef3..295164b9ee65 100644
--- a/pkg/cmd/roachtest/tests/versionupgrade.go
+++ b/pkg/cmd/roachtest/tests/versionupgrade.go
@@ -522,19 +522,6 @@ func waitForUpgradeStep(nodes option.NodeListOption) versionStep {
 	}
 }
 
-func setClusterSettingVersionStep(ctx context.Context, t test.Test, u *versionUpgradeTest) {
-	db := u.conn(ctx, t, 1)
-	t.L().Printf("bumping cluster version")
-	// TODO(tbg): once this is using a job, poll and periodically print the job status
-	// instead of blocking.
-	if _, err := db.ExecContext(
-		ctx, `SET CLUSTER SETTING version = crdb_internal.node_executable_version()`,
-	); err != nil {
-		t.Fatal(err)
-	}
-	t.L().Printf("cluster version bumped")
-}
-
 type versionFeatureTest struct {
 	name string
 	fn   func(context.Context, test.Test, *versionUpgradeTest, option.NodeListOption) (skipped bool)
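
For context, here is a minimal sketch of the retry-loop approach the patch switches to. This is not the actual `waitForUpgradeStep` helper from versionupgrade.go; `waitForClusterVersion` and the `conns` parameter are hypothetical stand-ins for the roachtest connection plumbing, shown only to illustrate polling for cluster-version convergence instead of issuing a blocking `SET CLUSTER SETTING version = ...` statement.

package upgradetest

import (
	"context"
	"database/sql"
	"time"
)

// waitForClusterVersion polls every node until the active cluster version
// matches the version the upgraded binary wants to finalize to.
func waitForClusterVersion(ctx context.Context, conns []*sql.DB) error {
	// Ask one node which version the new binary will finalize to.
	var target string
	if err := conns[0].QueryRowContext(ctx,
		`SELECT crdb_internal.node_executable_version()`).Scan(&target); err != nil {
		return err
	}
	for {
		converged := true
		for _, db := range conns {
			var active string
			if err := db.QueryRowContext(ctx,
				`SHOW CLUSTER SETTING version`).Scan(&active); err != nil {
				return err
			}
			if active != target {
				converged = false
				break
			}
		}
		if converged {
			return nil // every node reports the finalized cluster version
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second): // retry after a short pause
		}
	}
}

A real roachtest helper would additionally log progress on each attempt and bound the retries with the test's timeout machinery; the sketch keeps only the convergence check that makes the step robust to the `preserve_downgrade_option` reset still propagating.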