Skip to content

Commit

Permalink
kvserver,clusterversion: version-gate lease upgrade logic
Browse files Browse the repository at this point in the history
Informs #88301. EnableLeaseUpgrade version gates a change in the lease
transfer protocol whereby we only ever transfer expiration-based leases
(and have recipients later upgrade them to the more efficient epoch
based ones). This was done to limit the effects of ill-advised lease
transfers since the incoming leaseholder would need to recognize itself
as such within a few seconds. This needs version gating so that in
mixed-version clusters, as part of lease transfers, we don't start
sending out expiration based leases to nodes that (i) don't expect them
for certain keyspans, and (ii) don't know to upgrade them to efficient
epoch-based ones.

While here, we also introduce a (hidden, default=true) cluster setting
kv.transfer_expiration_leases_first.enabled to feature gate this
protocol change.
  • Loading branch information
irfansharif committed Sep 23, 2022
1 parent 6450a9d commit fc2d180
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docs/generated/settings/settings-for-tenants.txt
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,4 @@ trace.jaeger.agent string the address of a Jaeger agent to receive traces using
trace.opentelemetry.collector string address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as <host>:<port>. If no port is specified, 4317 will be used.
trace.span_registry.enabled boolean true if set, ongoing traces can be seen at https://<ui>/#/debug/tracez
trace.zipkin.collector string the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.
version version 1000022.1-70 set the active cluster version in the format '<major>.<minor>'
version version 1000022.1-72 set the active cluster version in the format '<major>.<minor>'
2 changes: 1 addition & 1 deletion docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,6 @@
<tr><td><code>trace.opentelemetry.collector</code></td><td>string</td><td><code></code></td><td>address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as <host>:<port>. If no port is specified, 4317 will be used.</td></tr>
<tr><td><code>trace.span_registry.enabled</code></td><td>boolean</td><td><code>true</code></td><td>if set, ongoing traces can be seen at https://<ui>/#/debug/tracez</td></tr>
<tr><td><code>trace.zipkin.collector</code></td><td>string</td><td><code></code></td><td>the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.</td></tr>
<tr><td><code>version</code></td><td>version</td><td><code>1000022.1-70</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
<tr><td><code>version</code></td><td>version</td><td><code>1000022.1-72</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
</tbody>
</table>
14 changes: 14 additions & 0 deletions pkg/clusterversion/cockroach_versions.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,16 @@ const (
// version is enabled, the receiver will look at the priority of snapshots
// using the fields added in 22.2.
PrioritizeSnapshots
// EnableLeaseUpgrade version gates a change in the lease transfer protocol
// whereby we only ever transfer expiration-based leases (and have
// recipients later upgrade them to the more efficient epoch based ones).
// This was done to limit the effects of ill-advised lease transfers since
// the incoming leaseholder would need to recognize itself as such within a
// few seconds. This needs version gating so that in mixed-version clusters,
// as part of lease transfers, we don't start sending out expiration based
// leases to nodes that (i) don't expect them for certain keyspans, and (ii)
// don't know to upgrade them to efficient epoch-based ones.
EnableLeaseUpgrade

// *************************************************
// Step (1): Add new versions here.
Expand Down Expand Up @@ -467,6 +477,10 @@ var rawVersionsSingleton = keyedVersions{
Key: PrioritizeSnapshots,
Version: roachpb.Version{Major: 22, Minor: 1, Internal: 70},
},
{
Key: EnableLeaseUpgrade,
Version: roachpb.Version{Major: 22, Minor: 1, Internal: 72},
},

// *************************************************
// Step (2): Add new versions here.
Expand Down
73 changes: 72 additions & 1 deletion pkg/kv/kvserver/client_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/keys"
Expand Down Expand Up @@ -1364,7 +1365,6 @@ func TestLeaseTransfersUseExpirationLeasesAndBumpToEpochBasedOnes(t *testing.T)
if !li.Current().OwnedBy(n2.GetFirstStoreID()) {
return errors.New("lease still owned by n1")
}
require.Equal(t, roachpb.LeaseExpiration, li.Current().Type())
return nil
})

Expand All @@ -1376,3 +1376,74 @@ func TestLeaseTransfersUseExpirationLeasesAndBumpToEpochBasedOnes(t *testing.T)
defer mu.Unlock()
require.Equal(t, roachpb.LeaseExpiration, mu.lease.Type())
}

// TestLeaseUpgradeVersionGate tests the version gating for the lease-upgrade
// process.
//
// TODO(irfansharif): Delete this in 23.1 (or whenever we get rid of the
// clusterversion.EnableLeaseUpgrade).
func TestLeaseUpgradeVersionGate(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx := context.Background()
st := cluster.MakeTestingClusterSettingsWithVersions(
clusterversion.TestingBinaryVersion,
clusterversion.ByKey(clusterversion.EnableLeaseUpgrade-1),
false, /* initializeVersion */
)
tci := serverutils.StartNewTestCluster(t, 2, base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
Settings: st,
Knobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
BinaryVersionOverride: clusterversion.ByKey(clusterversion.EnableLeaseUpgrade - 1),
},
},
},
})
tc := tci.(*testcluster.TestCluster)
defer tc.Stopper().Stop(ctx)

scratchKey := tc.ScratchRange(t)
n1, n1Target := tc.Server(0), tc.Target(0)
n2, n2Target := tc.Server(1), tc.Target(1)

// Add a replica; we're going to move the lease to and from it below.
desc := tc.AddVotersOrFatal(t, scratchKey, n2Target)

// Transfer the lease from n1 to n2. It should be transferred as an
// epoch-based one since we've not upgraded past
// clusterversion.EnableLeaseUpgrade yet.
tc.TransferRangeLeaseOrFatal(t, desc, n2Target)
testutils.SucceedsSoon(t, func() error {
li, _, err := tc.FindRangeLeaseEx(ctx, desc, nil)
require.NoError(t, err)
if !li.Current().OwnedBy(n2.GetFirstStoreID()) {
return errors.New("lease still owned by n1")
}
require.Equal(t, roachpb.LeaseEpoch, li.Current().Type())
return nil
})

// Enable the version gate.
_, err := tc.Conns[0].ExecContext(ctx, `SET CLUSTER SETTING version = $1`,
clusterversion.ByKey(clusterversion.EnableLeaseUpgrade).String())
require.NoError(t, err)

// Transfer the lease back from n2 to n1. It should be transferred as an
// expiration-based lease that's later upgraded to an epoch based one now
// that we're past the version gate.
tc.TransferRangeLeaseOrFatal(t, desc, n1Target)
testutils.SucceedsSoon(t, func() error {
li, _, err := tc.FindRangeLeaseEx(ctx, desc, nil)
require.NoError(t, err)
if !li.Current().OwnedBy(n1.GetFirstStoreID()) {
return errors.New("lease still owned by n2")
}
return nil
})
tc.WaitForLeaseUpgrade(ctx, t, desc)
}
14 changes: 13 additions & 1 deletion pkg/kv/kvserver/replica_range_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,14 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/constraint"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftutil"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/retry"
Expand All @@ -65,6 +67,13 @@ import (
"github.com/cockroachdb/redact"
)

var transferExpirationLeasesFirstEnabled = settings.RegisterBoolSetting(
settings.SystemOnly,
"kv.transfer_expiration_leases_first.enabled",
"controls whether we transfer expiration-based leases that are later upgraded to epoch-based ones",
true,
)

var leaseStatusLogLimiter = func() *log.EveryN {
e := log.Every(15 * time.Second)
e.ShouldLog() // waste the first shot
Expand Down Expand Up @@ -246,7 +255,10 @@ func (p *pendingLeaseRequest) InitOrJoinRequest(
ProposedTS: &status.Now,
}

if p.repl.requiresExpiringLeaseRLocked() || transfer {
if p.repl.requiresExpiringLeaseRLocked() ||
(transfer &&
transferExpirationLeasesFirstEnabled.Get(&p.repl.store.ClusterSettings().SV) &&
p.repl.store.ClusterSettings().Version.IsActive(ctx, clusterversion.EnableLeaseUpgrade)) {
// In addition to ranges that unconditionally require expiration-based
// leases (node liveness and earlier), we also use them during lease
// transfers for all other ranges. After acquiring these expiration
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/logictest/logictestbase/logictestbase.go
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,7 @@ var LogicTestConfigs = []TestClusterConfig{
NumNodes: 1,
OverrideDistSQLMode: "off",
BootstrapVersion: clusterversion.ByKey(clusterversion.V22_1),
BinaryVersion: clusterversion.ByKey(clusterversion.PrioritizeSnapshots), //TODO: switch to 22.2.
BinaryVersion: clusterversion.ByKey(clusterversion.EnableLeaseUpgrade), // TODO(dt): switch to 22.2.
DisableUpgrade: true,
DeclarativeCorpusCollection: true,
},
Expand Down

0 comments on commit fc2d180

Please sign in to comment.