From 8859c7685a438a6a508f5adc4fab070fa08952f9 Mon Sep 17 00:00:00 2001
From: Renato Costa
Date: Fri, 18 Nov 2022 12:06:33 -0500
Subject: [PATCH] roachtest: wait for upgrade to complete using retry loop in
 tpcc

Previously, the `tpcc/mixed-headroom` roachtests would reset the
`preserve_downgrade_option` setting and then wait for the upgrade to
finish by running a `SET CLUSTER SETTING version = '...'` statement.
However, that is not reliable: the statement can return an error if the
reset of `preserve_downgrade_option` has not yet propagated (see #87201).

To avoid this type of flake (which has been observed in manual runs),
we use a retry loop that waits for the cluster version to converge, as
is done by the majority of upgrade-related roachtests.

Epic: None.

Release note: None
---
 pkg/cmd/roachtest/tests/tpcc.go           |  2 +-
 pkg/cmd/roachtest/tests/versionupgrade.go | 13 -------------
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go
index fd5808c175b5..fe4a73a35c15 100644
--- a/pkg/cmd/roachtest/tests/tpcc.go
+++ b/pkg/cmd/roachtest/tests/tpcc.go
@@ -471,7 +471,7 @@ func runTPCCMixedHeadroom(
 		// to get a better idea of what errors come back here, if any.
 		// This will block until the long-running migrations have run.
 		allowAutoUpgradeStep(randomCRDBNode()),
-		setClusterSettingVersionStep,
+		waitForUpgradeStep(crdbNodes),
 		// Wait until TPCC background run terminates
 		// and fail if it reports an error.
 		tpccWorkload.wait,

diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go
index 1e6d52a42ef3..295164b9ee65 100644
--- a/pkg/cmd/roachtest/tests/versionupgrade.go
+++ b/pkg/cmd/roachtest/tests/versionupgrade.go
@@ -522,19 +522,6 @@ func waitForUpgradeStep(nodes option.NodeListOption) versionStep {
 	}
 }
 
-func setClusterSettingVersionStep(ctx context.Context, t test.Test, u *versionUpgradeTest) {
-	db := u.conn(ctx, t, 1)
-	t.L().Printf("bumping cluster version")
-	// TODO(tbg): once this is using a job, poll and periodically print the job status
-	// instead of blocking.
-	if _, err := db.ExecContext(
-		ctx, `SET CLUSTER SETTING version = crdb_internal.node_executable_version()`,
-	); err != nil {
-		t.Fatal(err)
-	}
-	t.L().Printf("cluster version bumped")
-}
-
 type versionFeatureTest struct {
 	name string
 	fn   func(context.Context, test.Test, *versionUpgradeTest, option.NodeListOption) (skipped bool)
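
For context, here is a minimal sketch of the retry-loop approach the patch switches to. This is not the actual `waitForUpgradeStep` helper from versionupgrade.go; `waitForClusterVersion` and the `conns` parameter are hypothetical stand-ins for the roachtest connection plumbing, shown only to illustrate polling for cluster-version convergence instead of issuing a blocking `SET CLUSTER SETTING version = ...` statement.

package upgradetest

import (
	"context"
	"database/sql"
	"time"
)

// waitForClusterVersion polls every node until the active cluster version
// matches the version the upgraded binary wants to finalize to.
func waitForClusterVersion(ctx context.Context, conns []*sql.DB) error {
	// Ask one node which version the new binary will finalize to.
	var target string
	if err := conns[0].QueryRowContext(ctx,
		`SELECT crdb_internal.node_executable_version()`).Scan(&target); err != nil {
		return err
	}
	for {
		converged := true
		for _, db := range conns {
			var active string
			if err := db.QueryRowContext(ctx,
				`SHOW CLUSTER SETTING version`).Scan(&active); err != nil {
				return err
			}
			if active != target {
				converged = false
				break
			}
		}
		if converged {
			return nil // every node reports the finalized cluster version
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second): // retry after a short pause
		}
	}
}

A real roachtest helper would additionally log progress on each attempt and bound the retries with the test's timeout machinery; the sketch keeps only the convergence check that makes the step robust to the `preserve_downgrade_option` reset still propagating.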