Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
104821: upgrade: checkpoint fence version for multi-tenant clusters r=JeffSwenson a=JeffSwenson

Previously, the tenant upgrade interlock checkpointed the version attached to the upgrade, when it should have checkpointed the fence version. This made it possible for an upgrade to skip versions. The conditions needed for this are rare, but could be triggered by crashes in the upgrade system or by the autoscaler downscaling a serverless cluster with multiple nodes.

Release Note: None

Fixes: cockroachdb#104884

104876: logic: keep cockroach-go logs in case of failure r=rafiss a=renatolabs

This commit makes use of the new `CockroachLogsDirOpt` option introduced in [1] to pass a custom directory where cockroach logs will be stored. This directory is relative to TMPDIR, which is part of the artifacts path when the test runs in CI. With this change, when a logic test using the cockroach-go config fails, we'll be able to see both the test logs as well as the cockroach logs themselves. The latter will be in the `cockroach.stderr` file.

[1] cockroachdb/cockroach-go#170

Epic: none

Release note: None

104879: upgrades: deflake TestBackfillJobsInfoTable r=stevendanna a=adityamaru

Previously, this test created jobs that could be adopted during test execution causing the number of payload/progress updates to be variable. With this change we create startable jobs so that they are not adopted and the number of payload/progress remain constant.

Fixes: cockroachdb#103046
Release note: None

Co-authored-by: Jeff <[email protected]>
Co-authored-by: Renato Costa <[email protected]>
Co-authored-by: adityamaru <[email protected]>
  • Loading branch information
4 people committed Jun 14, 2023
4 parents 8e3bf5b + 4f2eca6 + 8069f3a + f523899 commit 3441110
Show file tree
Hide file tree
Showing 10 changed files with 171 additions and 21 deletions.
6 changes: 3 additions & 3 deletions DEPS.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -1482,10 +1482,10 @@ def go_deps():
name = "com_github_cockroachdb_cockroach_go_v2",
build_file_proto_mode = "disable_global",
importpath = "github.com/cockroachdb/cockroach-go/v2",
sha256 = "25ae716dc921dce8336555cbc52b98243b8f1e5e33716afcd351cfd9c2538777",
strip_prefix = "github.com/cockroachdb/cockroach-go/[email protected].3",
sha256 = "70860a2615f3df73f7d00b4801b1fffe30a3306ae1cda1e9bf5245bb74e86d9a",
strip_prefix = "github.com/cockroachdb/cockroach-go/[email protected].5",
urls = [
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/cockroach-go/v2/com_github_cockroachdb_cockroach_go_v2-v2.3.3.zip",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/cockroach-go/v2/com_github_cockroachdb_cockroach_go_v2-v2.3.5.zip",
],
)
go_repository(
Expand Down
2 changes: 1 addition & 1 deletion build/bazelutil/distdir_files.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ DISTDIR_FILES = {
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/bubbletea/com_github_cockroachdb_bubbletea-v0.23.1-bracketed-paste2.zip": "d7916a0e7d8d814566e8f8d162c3764aea947296396a0a669564ff3ee53414bc",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/circuitbreaker/com_github_cockroachdb_circuitbreaker-v2.2.2-0.20190114160014-a614b14ccf63+incompatible.zip": "52fdb5ba6a60e9a2f1db42d5b3c4c13cc5bb3947d5ce7f1bba9b0a14de71813a",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/cmux/com_github_cockroachdb_cmux-v0.0.0-20170110192607-30d10be49292.zip": "88f6f9cf33eb535658540b46f6222f029398e590a3ff9cc873d7d561ac6debf0",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/cockroach-go/v2/com_github_cockroachdb_cockroach_go_v2-v2.3.3.zip": "25ae716dc921dce8336555cbc52b98243b8f1e5e33716afcd351cfd9c2538777",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/cockroach-go/v2/com_github_cockroachdb_cockroach_go_v2-v2.3.5.zip": "70860a2615f3df73f7d00b4801b1fffe30a3306ae1cda1e9bf5245bb74e86d9a",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/crlfmt/com_github_cockroachdb_crlfmt-v0.0.0-20221214225007-b2fc5c302548.zip": "fedc01bdd6d964da0425d5eaac8efadc951e78e13f102292cc0774197f09ab63",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/datadriven/com_github_cockroachdb_datadriven-v1.0.3-0.20230413201302-be42291fc80f.zip": "f6de9c180e1ea80c602d98247b8e8fe89f491648ab1425417b9aca082697cbc0",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/errors/com_github_cockroachdb_errors-v1.9.1.zip": "eeec411c969fb66b63af83527677cb84cceb50ed5003c51209e6bd76f1103a0b",
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ require (
github.com/cockroachdb/apd/v3 v3.2.0
github.com/cockroachdb/circuitbreaker v2.2.2-0.20190114160014-a614b14ccf63+incompatible
github.com/cockroachdb/cmux v0.0.0-20170110192607-30d10be49292
github.com/cockroachdb/cockroach-go/v2 v2.3.3
github.com/cockroachdb/cockroach-go/v2 v2.3.5
github.com/cockroachdb/crlfmt v0.0.0-20221214225007-b2fc5c302548
github.com/cockroachdb/datadriven v1.0.3-0.20230413201302-be42291fc80f
github.com/cockroachdb/errors v1.9.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -469,8 +469,8 @@ github.com/cockroachdb/circuitbreaker v2.2.2-0.20190114160014-a614b14ccf63+incom
github.com/cockroachdb/circuitbreaker v2.2.2-0.20190114160014-a614b14ccf63+incompatible/go.mod h1:v3T8+rm/HmCL0D1BwDcGaHHAQDuFPW7EsnYs2nBRqUo=
github.com/cockroachdb/cmux v0.0.0-20170110192607-30d10be49292 h1:dzj1/xcivGjNPwwifh/dWTczkwcuqsXXFHY1X/TZMtw=
github.com/cockroachdb/cmux v0.0.0-20170110192607-30d10be49292/go.mod h1:qRiX68mZX1lGBkTWyp3CLcenw9I94W2dLeRvMzcn9N4=
github.com/cockroachdb/cockroach-go/v2 v2.3.3 h1:fNmtG6XhoA1DhdDCIu66YyGSsNb1szj4CaAsbDxRmy4=
github.com/cockroachdb/cockroach-go/v2 v2.3.3/go.mod h1:1wNJ45eSXW9AnOc3skntW9ZUZz6gxrQK3cOj3rK+BC8=
github.com/cockroachdb/cockroach-go/v2 v2.3.5 h1:Khtm8K6fTTz/ZCWPzU9Ne3aOW9VyAnj4qIPCJgKtwK0=
github.com/cockroachdb/cockroach-go/v2 v2.3.5/go.mod h1:1wNJ45eSXW9AnOc3skntW9ZUZz6gxrQK3cOj3rK+BC8=
github.com/cockroachdb/crlfmt v0.0.0-20221214225007-b2fc5c302548 h1:i0bnjanlWAvM50wHMT7EFyxlt5HQusznWrkwl+HBIsU=
github.com/cockroachdb/crlfmt v0.0.0-20221214225007-b2fc5c302548/go.mod h1:qtkxNlt5i3rrdirfJE/bQeW/IeLajKexErv7jEIV+Uc=
github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:zn76sxSg3SzpJ0PPJaLDCu+Bu0Lg3sKTORVIj19EIF8=
Expand Down
17 changes: 15 additions & 2 deletions pkg/sql/logictest/logic.go
Original file line number Diff line number Diff line change
Expand Up @@ -1254,14 +1254,27 @@ var _ = ((*logicTest)(nil)).newTestServerCluster
// bootstrapBinaryPath is given by the config's CockroachGoBootstrapVersion.
// upgradeBinaryPath is given by the config's CockroachGoUpgradeVersion, or
// is the locally built version if CockroachGoUpgradeVersion was not specified.
func (t *logicTest) newTestServerCluster(bootstrapBinaryPath string, upgradeBinaryPath string) {
func (t *logicTest) newTestServerCluster(bootstrapBinaryPath, upgradeBinaryPath string) {
logsDir, err := os.MkdirTemp("", "cockroach-logs*")
if err != nil {
t.Fatal(err)
}
cleanupLogsDir := func() {
if t.rootT.Failed() {
fmt.Fprintf(os.Stderr, "cockroach logs captured in: %s\n", logsDir)
} else {
_ = os.RemoveAll(logsDir)
}
}

// During config initialization, NumNodes is required to be 3.
opts := []testserver.TestServerOpt{
testserver.ThreeNodeOpt(),
testserver.StoreOnDiskOpt(),
testserver.CockroachBinaryPathOpt(bootstrapBinaryPath),
testserver.UpgradeCockroachBinaryPathOpt(upgradeBinaryPath),
testserver.PollListenURLTimeoutOpt(120),
testserver.CockroachLogsDirOpt(logsDir),
}
if strings.Contains(upgradeBinaryPath, "cockroach-short") {
// If we're using a cockroach-short binary, that means it was
Expand All @@ -1284,7 +1297,7 @@ func (t *logicTest) newTestServerCluster(bootstrapBinaryPath string, upgradeBina
}

t.testserverCluster = ts
t.clusterCleanupFuncs = append(t.clusterCleanupFuncs, ts.Stop)
t.clusterCleanupFuncs = append(t.clusterCleanupFuncs, ts.Stop, cleanupLogsDir)

t.setUser(username.RootUser, 0 /* nodeIdx */)
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/upgrade/upgrademanager/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -49,24 +49,29 @@ go_test(
args = ["-test.timeout=295s"],
deps = [
"//pkg/base",
"//pkg/ccl/kvccl/kvtenantccl",
"//pkg/clusterversion",
"//pkg/jobs",
"//pkg/jobs/jobspb",
"//pkg/kv",
"//pkg/kv/kvserver/batcheval",
"//pkg/kv/kvserver/liveness",
"//pkg/roachpb",
"//pkg/security/securityassets",
"//pkg/security/securitytest",
"//pkg/server",
"//pkg/server/settingswatcher",
"//pkg/settings/cluster",
"//pkg/sql/execinfra",
"//pkg/sql/isql",
"//pkg/sql/sem/eval",
"//pkg/testutils",
"//pkg/testutils/serverutils",
"//pkg/testutils/sqlutils",
"//pkg/testutils/testcluster",
"//pkg/upgrade",
"//pkg/upgrade/upgradebase",
"//pkg/upgrade/upgrades",
"//pkg/util",
"//pkg/util/leaktest",
"//pkg/util/log",
Expand Down
2 changes: 1 addition & 1 deletion pkg/upgrade/upgrademanager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ func (m *Manager) Migrate(
if mustPersistFenceVersion {
var err error
for {
err = updateSystemVersionSetting(ctx, cv, validate)
err = updateSystemVersionSetting(ctx, fenceVersion, validate)
if errors.Is(err, upgradecluster.InconsistentSQLServersError) {
if err := bumpClusterVersion(ctx, m.deps.Cluster, fenceVersion); err != nil {
return err
Expand Down
131 changes: 131 additions & 0 deletions pkg/upgrade/upgrademanager/manager_external_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,34 @@ import (
"context"
gosql "database/sql"
"fmt"
"math/rand"
"sort"
"sync/atomic"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
_ "github.com/cockroachdb/cockroach/pkg/ccl/kvccl/kvtenantccl"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/server/settingswatcher"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/upgrade"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgradebase"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgrades"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
Expand Down Expand Up @@ -618,6 +625,7 @@ func TestPrecondition(t *testing.T) {
require.Equalf(t, exp, got, "server %d", i)
}
}

defer tc.Stopper().Stop(ctx)
sqlDB := tc.ServerConn(0)
{
Expand Down Expand Up @@ -662,3 +670,126 @@ func TestPrecondition(t *testing.T) {
checkActiveVersion(t, v2)
}
}

func TestMigrationFailure(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx := context.Background()

// Configure the range of versions used by the test
startVersionKey := clusterversion.BinaryMinSupportedVersionKey
startVersion := clusterversion.ByKey(startVersionKey)
endVersionKey := clusterversion.BinaryVersionKey
endVersion := clusterversion.ByKey(endVersionKey)

// Pick a random version in to fail at
versions := clusterversion.ListBetween(startVersion, endVersion)
failVersion := versions[rand.Intn(len(versions))]
fenceVersion := upgrade.FenceVersionFor(ctx, clusterversion.ClusterVersion{Version: failVersion}).Version
t.Logf("test will fail at version: %s", failVersion.String())

// Create a storage cluster for the tenant
testCluster := serverutils.StartNewTestCluster(t, 1, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
DefaultTestTenant: base.TestTenantDisabled,
Knobs: base.TestingKnobs{
SQLEvalContext: &eval.TestingKnobs{
TenantLogicalVersionKeyOverride: startVersionKey,
},
},
},
})
defer testCluster.Stopper().Stop(ctx)

// Set the version override so that the tenant is able to upgrade. If this is
// not set, the tenant treats the storage cluster as if it had the oldest
// supported binary version.
s := testCluster.Server(0)
goDB := serverutils.OpenDBConn(t, s.ServingSQLAddr(), "system", false, s.Stopper())
_, err := goDB.Exec(`ALTER TENANT ALL SET CLUSTER SETTING version = $1`, endVersion.String())
require.NoError(t, err)

// setting failUpgrade to false disables the upgrade error logic.
var failUpgrade atomic.Bool
failUpgrade.Store(true)

// Create a tenant cluster at the oldest supported binary version. Configure
// upgrade manager to fail the upgrade at a random version.
tenantSettings := cluster.MakeTestingClusterSettingsWithVersions(
endVersion,
startVersion,
false,
)
require.NoError(t, clusterversion.Initialize(ctx, startVersion, &tenantSettings.SV))
tenant, db := serverutils.StartTenant(t, testCluster.Server(0), base.TestTenantArgs{
TenantID: roachpb.MustMakeTenantID(10),
Settings: tenantSettings,
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
BootstrapVersionKeyOverride: startVersionKey,
BinaryVersionOverride: startVersion,
},
UpgradeManager: &upgradebase.TestingKnobs{
DontUseJobs: true,
RegistryOverride: func(cv roachpb.Version) (upgradebase.Upgrade, bool) {
if failUpgrade.Load() && cv == failVersion {
errorUpgrade := func(ctx context.Context, version clusterversion.ClusterVersion, deps upgrade.TenantDeps) error {
return errors.New("the upgrade failed with some error!")
}
return upgrade.NewTenantUpgrade("test", cv, nil, errorUpgrade), true
}
return upgrades.GetUpgrade(cv)
},
},
},
})

// Wait for the storage cluster version to propogate
watcher := tenant.SettingsWatcher().(*settingswatcher.SettingsWatcher)
testutils.SucceedsSoon(t, func() error {
storageVersion := watcher.GetStorageClusterActiveVersion()
if !storageVersion.IsActive(endVersionKey) {
return errors.Newf("expected storage version of at least %s found %s", endVersion.PrettyPrint(), storageVersion.PrettyPrint())
}
return nil
})

checkActiveVersion := func(t *testing.T, exp roachpb.Version) {
got := tenant.ClusterSettings().Version.ActiveVersion(ctx).Version
require.Equal(t, exp, got)
}
checkSettingVersion := func(t *testing.T, exp roachpb.Version) {
var settingVersion clusterversion.ClusterVersion
require.NoError(t, tenant.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
var err error
settingVersion, err = watcher.GetClusterVersionFromStorage(ctx, txn)
return err
}))
require.Equal(t, exp, settingVersion.Version)
}

checkActiveVersion(t, startVersion)
checkSettingVersion(t, startVersion)

// Try to finalize
_, err = db.Exec(`SET CLUSTER SETTING version = $1`, endVersion.String())
require.Error(t, err)
checkActiveVersion(t, fenceVersion)
// Note: we don't check the setting version here because the fence setting
// is only materialized on the first attempt.

// Try to finalize again.
_, err = db.Exec(`SET CLUSTER SETTING version = $1`, endVersion.String())
require.Error(t, err)
checkActiveVersion(t, fenceVersion)
checkSettingVersion(t, fenceVersion)

// Try to finalize a final time, allowing the upgrade to succeed.
failUpgrade.Store(false)
_, err = db.Exec(`SET CLUSTER SETTING version = $1`, endVersion.String())
require.NoError(t, err)
checkActiveVersion(t, endVersion)
checkSettingVersion(t, endVersion)
}
2 changes: 2 additions & 0 deletions pkg/upgrade/upgrades/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ go_test(
shard_count = 16,
deps = [
"//pkg/base",
"//pkg/ccl/backupccl",
"//pkg/ccl/changefeedccl",
"//pkg/cloud/userfile",
"//pkg/clusterversion",
"//pkg/jobs",
Expand Down
21 changes: 10 additions & 11 deletions pkg/upgrade/upgrades/backfill_job_info_table_migration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ import (
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
_ "github.com/cockroachdb/cockroach/pkg/ccl/backupccl"
_ "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/security/username"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/sql"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgrades"
Expand Down Expand Up @@ -53,25 +57,20 @@ func TestBackfillJobsInfoTable(t *testing.T) {
r := tc.Server(0).JobRegistry().(*jobs.Registry)
sqlDB := sqlutils.MakeSQLRunner(tc.ServerConn(0))
defer tc.Stopper().Stop(ctx)
jobspb.ForEachType(func(typ jobspb.Type) {
// The upgrade creates migration and schemachange jobs, so we do not
// need to create more. We should not override resumers for these job types,
// otherwise the upgrade will hang.
if typ != jobspb.TypeMigration && typ != jobspb.TypeSchemaChange {
r.TestingWrapResumerConstructor(typ, func(r jobs.Resumer) jobs.Resumer { return &fakeResumer{} })
}
}, false)

createJob := func(id jobspb.JobID, details jobspb.Details, progress jobspb.ProgressDetails) *jobs.Job {
createJob := func(id jobspb.JobID, details jobspb.Details, progress jobspb.ProgressDetails) {
defaultRecord := jobs.Record{
Details: details,
Progress: progress,
Username: username.TestUserName(),
}

job, err := r.CreateJobWithTxn(ctx, defaultRecord, id, nil /* txn */)
var job *jobs.StartableJob
db := tc.Server(0).InternalDB().(*sql.InternalDB)
err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
return r.CreateStartableJobWithTxn(ctx, &job, id, txn, defaultRecord)
})
require.NoError(t, err)
return job
}

// Create a few different types of jobs.
Expand Down

0 comments on commit 3441110

Please sign in to comment.