From c52b01ff57e3540a58b6c582cab287896d8e6284 Mon Sep 17 00:00:00 2001
From: Aayush Shah
Date: Wed, 7 Apr 2021 04:24:26 -0400
Subject: [PATCH] kvserver: perform initial upreplication of non-voters synchronously

Resolves https://github.com/cockroachdb/cockroach/issues/63199

Before this commit, we relied on the raft snapshot queue to asynchronously
perform the initial upreplication of non-voting replicas. This meant that by
the time `AdminChangeReplicas` (and consequently, `AdminRelocateRange`)
returned to its client, non-voters were not guaranteed to have been
initialized. This was a deliberate decision and was, thus far, believed to be
copacetic.

However, this decision subtly made range merges (of ranges that have any
number of non-voters) extremely unlikely to succeed, while causing severe
disruption to foreground traffic on the right hand side of a merge. This was
because the `mergeQueue` first calls `AdminRelocateRange` on the right hand
side range in order to collocate its replicas with the replicas of the left
hand side range. If the `mergeQueue` happened to relocate any non-voting
replicas, they were likely to still be waiting for their initial snapshot by
the time the `AdminMerge` attempted to subsume the RHS. Essentially, this
meant that we were subsuming the RHS of a merge while some of its replicas
weren't even initialized. This would cause the merge to fail and, in the
interim, block all traffic over the RHS range for a 5-second window.

This commit fixes the unfortunate sequence of events described above by
making the behavior of `AdminChangeReplicas` more symmetric for voting and
non-voting replicas. Now, if `AdminChangeReplicas` successfully returns, its
client can safely assume that all new replicas have at least been
upreplicated via an initial snapshot.

Release note: None
---
 .../kvfollowerreadsccl/followerreads_test.go  |  18 -
 pkg/kv/kvserver/client_merge_test.go          |  73 ++++
 pkg/kv/kvserver/client_migration_test.go      |   2 +-
 pkg/kv/kvserver/client_split_test.go          |  17 +-
 pkg/kv/kvserver/merge_queue.go                |   4 +
 pkg/kv/kvserver/raft.pb.go                    | 204 +++++-----
 pkg/kv/kvserver/raft.proto                    |  10 +-
 pkg/kv/kvserver/raft_log_queue.go             |  12 +-
 pkg/kv/kvserver/raft_log_queue_test.go        |  12 +-
 pkg/kv/kvserver/raft_snapshot_queue.go        |  21 +-
 pkg/kv/kvserver/replica_command.go            | 349 +++++++++++-------
 pkg/kv/kvserver/replica_learner_test.go       | 156 +++++---
 pkg/kv/kvserver/replica_raft.go               |   2 +-
 pkg/kv/kvserver/testing_knobs.go              |  27 +-
 pkg/roachpb/metadata_replicas.go              |  30 +-
 .../serverutils/test_cluster_shim.go          |   3 +-
 16 files changed, 577 insertions(+), 363 deletions(-)

diff --git a/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go b/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go
index 5a0203e1f8b6..22f94834f8e7 100644
--- a/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go
+++ b/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go
@@ -578,24 +578,6 @@ func TestFollowerReadsWithStaleDescriptor(t *testing.T) {
 	n1.Exec(t, `ALTER TABLE test EXPERIMENTAL_RELOCATE VOTERS VALUES (ARRAY[1], 1)`)
 	n1.Exec(t, `ALTER TABLE test EXPERIMENTAL_RELOCATE NON_VOTERS VALUES (ARRAY[3], 1)`)
-	// Wait until the new non-voter is upreplicated to n3.
-	testutils.SucceedsSoon(
-		t, func() error {
-			return tc.Server(2).GetStores().(*kvserver.Stores).VisitStores(
-				func(s *kvserver.Store) error {
-					repl := s.LookupReplica(tablePrefix)
-					if repl == nil {
-						return errors.Errorf("no replica found on store %s", s)
-					}
-					if !repl.IsInitialized() {
-						return errors.Errorf("non-voter not initialized")
-					}
-					return nil
-				},
-			)
-		},
-	)
-
 	// Execute the query again and assert the cache is updated. This query will
 	// not be executed as a follower read since it attempts to use n2 which
 	// doesn't have a replica any more and then it tries n1 which returns an
diff --git a/pkg/kv/kvserver/client_merge_test.go b/pkg/kv/kvserver/client_merge_test.go
index 10e6954f1da5..08283a4b817d 100644
--- a/pkg/kv/kvserver/client_merge_test.go
+++ b/pkg/kv/kvserver/client_merge_test.go
@@ -4471,6 +4471,79 @@ func TestMergeQueueSeesNonVoters(t *testing.T) {
 	}
 }
 
+// TestMergeQueueWithSlowNonVoterSnaps aims to check that non-voting replicas
+// are initially upreplicated through a synchronously-sent snapshot inside
+// `AdminChangeReplicas`, like voting replicas are. Otherwise, range merges
+// could be allowed to proceed with subsuming the right-hand side range while it
+// still has uninitialized non-voters.
+//
+// Regression test for https://github.com/cockroachdb/cockroach/issues/63199.
+func TestMergeQueueWithSlowNonVoterSnaps(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	skip.UnderShort(t, "this test sleeps for a few seconds")
+
+	ctx := context.Background()
+	var delaySnapshotTrap atomic.Value
+	var clusterArgs = base.TestClusterArgs{
+		// We don't want the replicate queue mucking with our test, so disable it.
+		ReplicationMode: base.ReplicationManual,
+		ServerArgsPerNode: map[int]base.TestServerArgs{
+			1: {
+				Knobs: base.TestingKnobs{
+					Store: &kvserver.StoreTestingKnobs{
+						ReceiveSnapshot: func(header *kvserver.SnapshotRequest_Header) error {
+							val := delaySnapshotTrap.Load()
+							if val != nil {
+								fn := val.(func() error)
+								return fn()
+							}
+							return nil
+						},
+					},
+				},
+			},
+		},
+	}
+
+	dbName := "testdb"
+	tableName := "kv"
+	numNodes := 3
+	tc, _ := setupTestClusterWithDummyRange(t, clusterArgs, dbName, tableName, numNodes)
+	defer tc.Stopper().Stop(ctx)
+	// We're controlling merge queue operation via
+	// `store.SetMergeQueueActive`, so enable the cluster setting here.
+	_, err := tc.ServerConn(0).Exec(`SET CLUSTER SETTING kv.range_merge.queue_enabled=true`)
+	require.NoError(t, err)
+
+	store, err := tc.Server(0).GetStores().(*kvserver.Stores).GetStore(1)
+	require.Nil(t, err)
+	// We're going to split the dummy range created above with an empty
+	// expiration time. Disable the merge queue before splitting so that the
+	// split ranges aren't immediately merged.
+	store.SetMergeQueueActive(false)
+	leftDesc, rightDesc := splitDummyRangeInTestCluster(
+		t, tc, dbName, tableName, hlc.Timestamp{}, /* splitExpirationTime */
+	)
+	require.Equal(t, 1, len(leftDesc.Replicas().Descriptors()))
+	require.Equal(t, 1, len(rightDesc.Replicas().Descriptors()))
+
+	// Add non-voters for the LHS and RHS on servers 1 and 2 respectively so that
+	// the merge queue logic has to explicitly relocate the RHS non-voter to
+	// server 1, in order to align replica sets to proceed with the merge.
+ tc.AddNonVotersOrFatal(t, leftDesc.StartKey.AsRawKey(), tc.Target(1)) + tc.AddNonVotersOrFatal(t, rightDesc.StartKey.AsRawKey(), tc.Target(2)) + + delaySnapshotTrap.Store(func() error { + time.Sleep(5 * time.Second) + return nil + }) + store.SetMergeQueueActive(true) + store.MustForceMergeScanAndProcess() + verifyMerged(t, store, leftDesc.StartKey, rightDesc.StartKey) +} + func TestInvalidSubsumeRequest(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) diff --git a/pkg/kv/kvserver/client_migration_test.go b/pkg/kv/kvserver/client_migration_test.go index 59d658dbfc72..714ef34cf8c7 100644 --- a/pkg/kv/kvserver/client_migration_test.go +++ b/pkg/kv/kvserver/client_migration_test.go @@ -186,7 +186,7 @@ func TestMigrateWithInflightSnapshot(t *testing.T) { if processErr != nil { return processErr } - const msg = `skipping snapshot; replica is likely a learner in the process of being added: (n2,s2):2LEARNER` + const msg = `skipping snapshot; replica is likely a LEARNER in the process of being added: (n2,s2):2LEARNER` formattedTrace := trace.String() if !strings.Contains(formattedTrace, msg) { return errors.Errorf(`expected "%s" in trace got:\n%s`, msg, formattedTrace) diff --git a/pkg/kv/kvserver/client_split_test.go b/pkg/kv/kvserver/client_split_test.go index 7bd40ae1e874..865eb4eb0b6e 100644 --- a/pkg/kv/kvserver/client_split_test.go +++ b/pkg/kv/kvserver/client_split_test.go @@ -3323,18 +3323,21 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) { ctx := context.Background() blockPromoteCh := make(chan struct{}) - var skipLearnerSnaps int32 + var skipSnaps int32 withoutLearnerSnap := func(fn func()) { - atomic.StoreInt32(&skipLearnerSnaps, 1) + atomic.StoreInt32(&skipSnaps, 1) fn() - atomic.StoreInt32(&skipLearnerSnaps, 0) + atomic.StoreInt32(&skipSnaps, 0) } knobs := base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{ - ReplicaSkipLearnerSnapshot: func() bool { - return atomic.LoadInt32(&skipLearnerSnaps) != 0 + ReplicaSkipInitialSnapshot: func() bool { + return atomic.LoadInt32(&skipSnaps) != 0 }, - ReplicaAddStopAfterLearnerSnapshot: func(targets []roachpb.ReplicationTarget) bool { - if atomic.LoadInt32(&skipLearnerSnaps) != 0 { + RaftSnapshotQueueSkipReplica: func() bool { + return atomic.LoadInt32(&skipSnaps) != 0 + }, + VoterAddStopAfterLearnerSnapshot: func(targets []roachpb.ReplicationTarget) bool { + if atomic.LoadInt32(&skipSnaps) != 0 { return false } if len(targets) > 0 && targets[0].StoreID == 2 { diff --git a/pkg/kv/kvserver/merge_queue.go b/pkg/kv/kvserver/merge_queue.go index 81c4bdd57078..a964e00fcdb7 100644 --- a/pkg/kv/kvserver/merge_queue.go +++ b/pkg/kv/kvserver/merge_queue.go @@ -352,6 +352,10 @@ func (mq *mergeQueue) process( } else if err != nil { // While range merges are unstable, be extra cautious and mark every error // as purgatory-worthy. + // + // TODO(aayush): Merges are indeed stable now, we can be smarter here about + // which errors should be marked as purgatory-worthy. 
+ log.Warningf(ctx, "%v", err) return false, rangeMergePurgatoryError{err} } if testingAggressiveConsistencyChecks { diff --git a/pkg/kv/kvserver/raft.pb.go b/pkg/kv/kvserver/raft.pb.go index bf1cb8d484bc..2ca0f85a3fb0 100644 --- a/pkg/kv/kvserver/raft.pb.go +++ b/pkg/kv/kvserver/raft.pb.go @@ -53,7 +53,7 @@ func (x SnapshotRequest_Priority) String() string { return proto.EnumName(SnapshotRequest_Priority_name, int32(x)) } func (SnapshotRequest_Priority) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{5, 0} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{5, 0} } type SnapshotRequest_Strategy int32 @@ -77,7 +77,7 @@ func (x SnapshotRequest_Strategy) String() string { return proto.EnumName(SnapshotRequest_Strategy_name, int32(x)) } func (SnapshotRequest_Strategy) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{5, 1} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{5, 1} } // Type is used for metrics collection on the receiver side. See @@ -86,30 +86,30 @@ type SnapshotRequest_Type int32 const ( // VIA_SNAPSHOT_QUEUE indicates the snapshots sent by the raft snapshot - // queue. + // queue to all types of replicas. SnapshotRequest_VIA_SNAPSHOT_QUEUE SnapshotRequest_Type = 0 - // LEARNER_INITIAL indicates the initial snapshots sent to LEARNER replicas - // for upreplication, before they're promoted to full voters. + // INITIAL indicates the initial snapshots sent to LEARNER (before they're + // promoted to full voters) and NON_VOTER replicas for upreplication. // // As of the time of writing, we only send this snapshot from the - // execReplicationChangesForVoters after creating a new LEARNER replica. - SnapshotRequest_LEARNER_INITIAL SnapshotRequest_Type = 1 + // initializeRaftLearners after creating a new LEARNER or NON_VOTER replica. 
+ SnapshotRequest_INITIAL SnapshotRequest_Type = 1 ) var SnapshotRequest_Type_name = map[int32]string{ 0: "VIA_SNAPSHOT_QUEUE", - 1: "LEARNER_INITIAL", + 1: "INITIAL", } var SnapshotRequest_Type_value = map[string]int32{ "VIA_SNAPSHOT_QUEUE": 0, - "LEARNER_INITIAL": 1, + "INITIAL": 1, } func (x SnapshotRequest_Type) String() string { return proto.EnumName(SnapshotRequest_Type_name, int32(x)) } func (SnapshotRequest_Type) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{5, 2} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{5, 2} } type SnapshotResponse_Status int32 @@ -141,7 +141,7 @@ func (x SnapshotResponse_Status) String() string { return proto.EnumName(SnapshotResponse_Status_name, int32(x)) } func (SnapshotResponse_Status) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{6, 0} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{6, 0} } // RaftHeartbeat is a request that contains the barebones information for a @@ -163,7 +163,7 @@ func (m *RaftHeartbeat) Reset() { *m = RaftHeartbeat{} } func (m *RaftHeartbeat) String() string { return proto.CompactTextString(m) } func (*RaftHeartbeat) ProtoMessage() {} func (*RaftHeartbeat) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{0} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{0} } func (m *RaftHeartbeat) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -239,7 +239,7 @@ func (m *RaftMessageRequest) Reset() { *m = RaftMessageRequest{} } func (m *RaftMessageRequest) String() string { return proto.CompactTextString(m) } func (*RaftMessageRequest) ProtoMessage() {} func (*RaftMessageRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{1} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{1} } func (m *RaftMessageRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -272,7 +272,7 @@ func (m *RaftMessageRequestBatch) Reset() { *m = RaftMessageRequestBatch func (m *RaftMessageRequestBatch) String() string { return proto.CompactTextString(m) } func (*RaftMessageRequestBatch) ProtoMessage() {} func (*RaftMessageRequestBatch) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{2} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{2} } func (m *RaftMessageRequestBatch) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -305,7 +305,7 @@ func (m *RaftMessageResponseUnion) Reset() { *m = RaftMessageResponseUni func (m *RaftMessageResponseUnion) String() string { return proto.CompactTextString(m) } func (*RaftMessageResponseUnion) ProtoMessage() {} func (*RaftMessageResponseUnion) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{3} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{3} } func (m *RaftMessageResponseUnion) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -348,7 +348,7 @@ func (m *RaftMessageResponse) Reset() { *m = RaftMessageResponse{} } func (m *RaftMessageResponse) String() string { return proto.CompactTextString(m) } func (*RaftMessageResponse) ProtoMessage() {} func (*RaftMessageResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{4} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{4} } func (m *RaftMessageResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -389,7 +389,7 @@ func (m *SnapshotRequest) Reset() { *m = SnapshotRequest{} } func (m *SnapshotRequest) String() string { return proto.CompactTextString(m) } 
func (*SnapshotRequest) ProtoMessage() {} func (*SnapshotRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{5} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{5} } func (m *SnapshotRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -451,7 +451,7 @@ func (m *SnapshotRequest_Header) Reset() { *m = SnapshotRequest_Header{} func (m *SnapshotRequest_Header) String() string { return proto.CompactTextString(m) } func (*SnapshotRequest_Header) ProtoMessage() {} func (*SnapshotRequest_Header) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{5, 0} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{5, 0} } func (m *SnapshotRequest_Header) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -485,7 +485,7 @@ func (m *SnapshotResponse) Reset() { *m = SnapshotResponse{} } func (m *SnapshotResponse) String() string { return proto.CompactTextString(m) } func (*SnapshotResponse) ProtoMessage() {} func (*SnapshotResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{6} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{6} } func (m *SnapshotResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -522,7 +522,7 @@ func (m *ConfChangeContext) Reset() { *m = ConfChangeContext{} } func (m *ConfChangeContext) String() string { return proto.CompactTextString(m) } func (*ConfChangeContext) ProtoMessage() {} func (*ConfChangeContext) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_6d6ea5455d4829c1, []int{7} + return fileDescriptor_raft_e5b26aa81c5a4a10, []int{7} } func (m *ConfChangeContext) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -2856,89 +2856,89 @@ var ( ErrIntOverflowRaft = fmt.Errorf("proto: integer overflow") ) -func init() { proto.RegisterFile("kv/kvserver/raft.proto", fileDescriptor_raft_6d6ea5455d4829c1) } +func init() { proto.RegisterFile("kv/kvserver/raft.proto", fileDescriptor_raft_e5b26aa81c5a4a10) } -var fileDescriptor_raft_6d6ea5455d4829c1 = []byte{ - // 1296 bytes of a gzipped FileDescriptorProto +var fileDescriptor_raft_e5b26aa81c5a4a10 = []byte{ + // 1289 bytes of a gzipped FileDescriptorProto 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe4, 0x57, 0x4f, 0x73, 0xdb, 0x44, - 0x14, 0xb7, 0x12, 0xc5, 0x96, 0xd7, 0x71, 0x23, 0xb6, 0x21, 0x88, 0x4c, 0xb1, 0x83, 0xa6, 0x30, - 0xe1, 0x9f, 0x3c, 0xa4, 0x85, 0x19, 0xb8, 0x80, 0xff, 0xa8, 0x13, 0xc5, 0xa9, 0x93, 0xae, 0x9d, - 0x30, 0xc0, 0x80, 0x46, 0x96, 0xd7, 0xb6, 0xc6, 0xb6, 0x56, 0x95, 0xd6, 0x06, 0xf7, 0x53, 0x70, - 0xe5, 0xc6, 0x8d, 0x4f, 0xc2, 0x4c, 0x8e, 0x3d, 0xf6, 0xc0, 0x78, 0xc0, 0x3d, 0xf3, 0x05, 0x7a, - 0x62, 0x76, 0xb5, 0x72, 0xdc, 0xb4, 0x69, 0x9b, 0x19, 0x38, 0x30, 0x9c, 0xf2, 0xf6, 0xf9, 0xbd, - 0xdf, 0xdb, 0xf7, 0xde, 0xef, 0xbd, 0x55, 0xc0, 0xd6, 0x60, 0x52, 0x1a, 0x4c, 0x22, 0x1c, 0x4e, - 0x70, 0x58, 0x0a, 0x9d, 0x2e, 0x35, 0x82, 0x90, 0x50, 0x02, 0x5f, 0x77, 0x89, 0x3b, 0x08, 0x89, - 0xe3, 0xf6, 0x8d, 0xc1, 0xc4, 0x48, 0x2c, 0xb6, 0x37, 0xb9, 0x2a, 0x68, 0x97, 0x70, 0x18, 0x92, - 0x30, 0x8a, 0x8d, 0xb7, 0xb7, 0x12, 0xed, 0x08, 0x53, 0xa7, 0xe3, 0x50, 0x47, 0xe8, 0x8d, 0x65, - 0xf0, 0xa1, 0x37, 0xc1, 0x3e, 0x8e, 0xa2, 0x85, 0x10, 0xb4, 0x17, 0xa2, 0xb0, 0xd7, 0x97, 0xed, - 0x13, 0x21, 0x68, 0x97, 0x22, 0xea, 0x50, 0x2c, 0x6c, 0x0a, 0x98, 0xba, 0x1d, 0x7e, 0xd3, 0xd2, - 0xe4, 0x16, 0xff, 0x1b, 0xb4, 0x97, 0x2e, 0xbe, 0xbd, 0xd9, 0x23, 0x3d, 0xc2, 0xc5, 0x12, 0x93, - 0x62, 0xad, 0xfe, 0x97, 0x0c, 0xf2, 0xc8, 0xe9, 0xd2, 0x7d, 0xec, 0x84, 
0xb4, 0x8d, 0x1d, 0x0a, - 0xbf, 0x07, 0x4a, 0xe8, 0xf8, 0x3d, 0x6c, 0x7b, 0x1d, 0x4d, 0xda, 0x91, 0x76, 0xe5, 0x4a, 0x75, - 0x3e, 0x2b, 0x66, 0x10, 0xd3, 0x59, 0xb5, 0x27, 0xb3, 0xe2, 0xed, 0x9e, 0x47, 0xfb, 0xe3, 0xb6, - 0xe1, 0x92, 0x51, 0x69, 0x51, 0x8c, 0x4e, 0xfb, 0x5c, 0x2e, 0x05, 0x83, 0x5e, 0x49, 0x64, 0x6e, - 0x08, 0x3f, 0x94, 0xe1, 0xa0, 0x56, 0x07, 0x46, 0x60, 0xa3, 0x1b, 0x92, 0x91, 0x1d, 0xe2, 0x60, - 0xe8, 0xb9, 0x0e, 0x0b, 0xb3, 0xb2, 0x23, 0xed, 0xe6, 0x2b, 0xf5, 0xf9, 0xac, 0x98, 0xbf, 0x13, - 0x92, 0x11, 0x8a, 0x7f, 0xe1, 0xc1, 0x3e, 0xbd, 0x5a, 0xb0, 0xc4, 0x13, 0xe5, 0xbb, 0x4b, 0x40, - 0x1d, 0x38, 0x02, 0x79, 0x4a, 0x96, 0x43, 0xae, 0xf2, 0x90, 0xd6, 0x7c, 0x56, 0xcc, 0xb5, 0xc8, - 0x3f, 0x11, 0x30, 0x47, 0xc9, 0x79, 0x38, 0x08, 0x64, 0x8a, 0xc3, 0x91, 0x26, 0xb3, 0xfa, 0x21, - 0x2e, 0xc3, 0x2d, 0x90, 0x76, 0xc9, 0x68, 0xe4, 0x51, 0x6d, 0x8d, 0x6b, 0xc5, 0x09, 0x6a, 0x20, - 0x73, 0x7f, 0xec, 0xe1, 0xc8, 0xc5, 0x5a, 0x7a, 0x47, 0xda, 0x55, 0x50, 0x72, 0x84, 0x0f, 0xc0, - 0x8d, 0xa1, 0xd3, 0xeb, 0x79, 0x7e, 0xcf, 0xee, 0x92, 0xe1, 0x90, 0xfc, 0x80, 0xc3, 0xc8, 0x26, - 0xbe, 0x9d, 0x98, 0x2b, 0x3b, 0xab, 0xbb, 0xb9, 0xbd, 0x5b, 0xc6, 0x73, 0x19, 0x69, 0x2c, 0x28, - 0x74, 0x4e, 0x2b, 0xe3, 0x50, 0x88, 0x15, 0xf9, 0x6c, 0x56, 0x4c, 0xa1, 0x37, 0x05, 0xfc, 0x9d, - 0x04, 0xfd, 0xc8, 0xbf, 0x27, 0x62, 0x1f, 0x83, 0x77, 0x5e, 0x14, 0xdb, 0x76, 0x5c, 0x77, 0x1c, - 0x3a, 0x14, 0x6b, 0x80, 0xdf, 0xf9, 0xed, 0x4b, 0x91, 0xca, 0xc2, 0xf0, 0x40, 0x56, 0x32, 0xaa, - 0xa2, 0xff, 0x9a, 0x06, 0x90, 0xf1, 0xed, 0x2e, 0x8e, 0x22, 0xa7, 0x87, 0x11, 0xbe, 0x3f, 0xc6, - 0xd1, 0xbf, 0x4f, 0xba, 0xbb, 0x60, 0x7d, 0x99, 0x74, 0x9c, 0x71, 0xb9, 0xbd, 0x9b, 0x4b, 0xa5, - 0xbb, 0xd0, 0xd1, 0x1a, 0x8e, 0xdc, 0xd0, 0x0b, 0x28, 0x09, 0x45, 0xad, 0x72, 0x4b, 0x84, 0x82, - 0x16, 0x00, 0xe7, 0x74, 0xe2, 0x5c, 0xba, 0x1a, 0x58, 0x76, 0x41, 0x16, 0x58, 0x02, 0x99, 0x51, - 0x5c, 0x0b, 0xce, 0x96, 0xdc, 0xde, 0x86, 0x11, 0xcf, 0xae, 0x21, 0x4a, 0x24, 0x5c, 0x12, 0xab, - 0x65, 0xbe, 0xac, 0x3d, 0xcd, 0x97, 0x03, 0x00, 0xfa, 0xc9, 0x18, 0x47, 0x5a, 0x9a, 0xb3, 0xe3, - 0xe6, 0x25, 0xec, 0x78, 0x6a, 0xe6, 0x45, 0x88, 0x25, 0x6f, 0xd8, 0x04, 0x1b, 0x8b, 0x93, 0x1d, - 0xe2, 0x28, 0x88, 0xb4, 0xcc, 0x95, 0x01, 0xaf, 0x2d, 0x20, 0x10, 0x43, 0x80, 0xdf, 0x81, 0x8d, - 0xb8, 0xcb, 0x11, 0x75, 0x42, 0x6a, 0x0f, 0xf0, 0x54, 0x53, 0x76, 0xa4, 0xdd, 0xf5, 0xca, 0x27, - 0x4f, 0x66, 0xc5, 0x8f, 0xaf, 0xd6, 0xe1, 0x3a, 0x9e, 0xa2, 0x3c, 0x47, 0x6b, 0x32, 0xb0, 0x3a, - 0x9e, 0xbe, 0x74, 0x5e, 0xb2, 0xff, 0xa5, 0x79, 0xd1, 0xbb, 0xe0, 0x8d, 0x67, 0x07, 0xa5, 0xe2, - 0x50, 0xb7, 0x0f, 0xeb, 0x40, 0x09, 0xe3, 0x73, 0xa4, 0x49, 0x3c, 0xa9, 0xf7, 0x5e, 0xd0, 0x95, - 0x0b, 0x08, 0x71, 0x2a, 0x0b, 0x00, 0xfd, 0x18, 0x68, 0x4f, 0x59, 0x45, 0x01, 0xf1, 0x23, 0x7c, + 0x14, 0xb7, 0x12, 0xc5, 0x96, 0xd7, 0x71, 0x23, 0x96, 0x10, 0x44, 0xa6, 0xd8, 0x41, 0x53, 0x98, + 0xf0, 0x4f, 0x9e, 0xa6, 0x85, 0x03, 0x17, 0xf0, 0x1f, 0x75, 0xa2, 0x38, 0x4d, 0xdc, 0xb5, 0x13, + 0x06, 0x18, 0xd0, 0xc8, 0xf2, 0xda, 0xd6, 0xd8, 0xd6, 0xaa, 0xd2, 0xda, 0xe0, 0x7e, 0x0a, 0xae, + 0xdc, 0xb8, 0xf1, 0x49, 0x98, 0xc9, 0xb1, 0x27, 0xa6, 0x07, 0xc6, 0x03, 0xee, 0x99, 0x2f, 0xd0, + 0x13, 0xb3, 0xab, 0x95, 0xe3, 0xa6, 0x4d, 0xdb, 0xcc, 0xc0, 0x81, 0xe1, 0x94, 0xb7, 0xcf, 0xef, + 0xfd, 0xde, 0xbe, 0xf7, 0x7e, 0xef, 0xad, 0x02, 0xb6, 0x06, 0x93, 0xd2, 0x60, 0x12, 0xe1, 0x70, + 0x82, 0xc3, 0x52, 0xe8, 0x74, 0xa9, 0x11, 0x84, 0x84, 0x12, 0xf8, 0x86, 0x4b, 0xdc, 0x41, 0x48, + 0x1c, 0xb7, 0x6f, 0x0c, 0x26, 0x46, 0x62, 0xb1, 0xbd, 0xc9, 0x55, 0x41, 0xbb, 0x84, 0xc3, 0x90, + 
0x84, 0x51, 0x6c, 0xbc, 0xbd, 0x95, 0x68, 0x47, 0x98, 0x3a, 0x1d, 0x87, 0x3a, 0x42, 0x6f, 0x2c, + 0x83, 0x0f, 0xbd, 0x09, 0xf6, 0x71, 0x14, 0x2d, 0x84, 0xa0, 0xbd, 0x10, 0x85, 0xbd, 0xbe, 0x6c, + 0x9f, 0x08, 0x41, 0xbb, 0x14, 0x51, 0x87, 0x62, 0x61, 0x53, 0xc0, 0xd4, 0xed, 0xf0, 0x9b, 0x96, + 0x26, 0xb7, 0xf8, 0xdf, 0xa0, 0xbd, 0x74, 0xf1, 0xed, 0xcd, 0x1e, 0xe9, 0x11, 0x2e, 0x96, 0x98, + 0x14, 0x6b, 0xf5, 0xbf, 0x64, 0x90, 0x47, 0x4e, 0x97, 0xee, 0x63, 0x27, 0xa4, 0x6d, 0xec, 0x50, + 0xf8, 0x1d, 0x50, 0x42, 0xc7, 0xef, 0x61, 0xdb, 0xeb, 0x68, 0xd2, 0x8e, 0xb4, 0x2b, 0x57, 0xaa, + 0xf3, 0x59, 0x31, 0x83, 0x98, 0xce, 0xaa, 0x3d, 0x99, 0x15, 0x6f, 0xf7, 0x3c, 0xda, 0x1f, 0xb7, + 0x0d, 0x97, 0x8c, 0x4a, 0x8b, 0x62, 0x74, 0xda, 0xe7, 0x72, 0x29, 0x18, 0xf4, 0x4a, 0x22, 0x73, + 0x43, 0xf8, 0xa1, 0x0c, 0x07, 0xb5, 0x3a, 0x30, 0x02, 0x1b, 0xdd, 0x90, 0x8c, 0xec, 0x10, 0x07, + 0x43, 0xcf, 0x75, 0x58, 0x98, 0x95, 0x1d, 0x69, 0x37, 0x5f, 0xa9, 0xcf, 0x67, 0xc5, 0xfc, 0x9d, + 0x90, 0x8c, 0x50, 0xfc, 0x0b, 0x0f, 0xf6, 0xe9, 0xd5, 0x82, 0x25, 0x9e, 0x28, 0xdf, 0x5d, 0x02, + 0xea, 0xc0, 0x11, 0xc8, 0x53, 0xb2, 0x1c, 0x72, 0x95, 0x87, 0xb4, 0xe6, 0xb3, 0x62, 0xae, 0x45, + 0xfe, 0x89, 0x80, 0x39, 0x4a, 0xce, 0xc3, 0x41, 0x20, 0x53, 0x1c, 0x8e, 0x34, 0x99, 0xd5, 0x0f, + 0x71, 0x19, 0x6e, 0x81, 0xb4, 0x4b, 0x46, 0x23, 0x8f, 0x6a, 0x6b, 0x5c, 0x2b, 0x4e, 0x50, 0x03, + 0x99, 0xfb, 0x63, 0x0f, 0x47, 0x2e, 0xd6, 0xd2, 0x3b, 0xd2, 0xae, 0x82, 0x92, 0x23, 0x7c, 0x00, + 0xae, 0x0f, 0x9d, 0x5e, 0xcf, 0xf3, 0x7b, 0x76, 0x97, 0x0c, 0x87, 0xe4, 0x7b, 0x1c, 0x46, 0x36, + 0xf1, 0xed, 0xc4, 0x5c, 0xd9, 0x59, 0xdd, 0xcd, 0xed, 0xdd, 0x32, 0x9e, 0xcb, 0x48, 0x63, 0x41, + 0xa1, 0x73, 0x5a, 0x19, 0x87, 0x42, 0xac, 0xc8, 0x67, 0xb3, 0x62, 0x0a, 0xbd, 0x25, 0xe0, 0xef, + 0x24, 0xe8, 0xc7, 0xfe, 0x3d, 0x11, 0xbb, 0x01, 0xde, 0x7d, 0x51, 0x6c, 0xdb, 0x71, 0xdd, 0x71, + 0xe8, 0x50, 0xac, 0x01, 0x7e, 0xe7, 0x77, 0x2e, 0x45, 0x2a, 0x0b, 0xc3, 0x03, 0x59, 0xc9, 0xa8, + 0x8a, 0xfe, 0x4b, 0x1a, 0x40, 0xc6, 0xb7, 0xbb, 0x38, 0x8a, 0x9c, 0x1e, 0x46, 0xf8, 0xfe, 0x18, + 0x47, 0xff, 0x3e, 0xe9, 0xee, 0x82, 0xf5, 0x65, 0xd2, 0x71, 0xc6, 0xe5, 0xf6, 0x6e, 0x2c, 0x95, + 0xee, 0x42, 0x47, 0x6b, 0x38, 0x72, 0x43, 0x2f, 0xa0, 0x24, 0x14, 0xb5, 0xca, 0x2d, 0x11, 0x0a, + 0x5a, 0x00, 0x9c, 0xd3, 0x89, 0x73, 0xe9, 0x6a, 0x60, 0xd9, 0x05, 0x59, 0x60, 0x09, 0x64, 0x46, + 0x71, 0x2d, 0x38, 0x5b, 0x72, 0x7b, 0x1b, 0x46, 0x3c, 0xbb, 0x86, 0x28, 0x91, 0x70, 0x49, 0xac, + 0x96, 0xf9, 0xb2, 0xf6, 0x34, 0x5f, 0x0e, 0x00, 0xe8, 0x27, 0x63, 0x1c, 0x69, 0x69, 0xce, 0x8e, + 0x1b, 0x97, 0xb0, 0xe3, 0xa9, 0x99, 0x17, 0x21, 0x96, 0xbc, 0x61, 0x13, 0x6c, 0x2c, 0x4e, 0x76, + 0x88, 0xa3, 0x20, 0xd2, 0x32, 0x57, 0x06, 0xbc, 0xb6, 0x80, 0x40, 0x0c, 0x01, 0x7e, 0x0b, 0x36, + 0xe2, 0x2e, 0x47, 0xd4, 0x09, 0xa9, 0x3d, 0xc0, 0x53, 0x4d, 0xd9, 0x91, 0x76, 0xd7, 0x2b, 0x9f, + 0x3c, 0x99, 0x15, 0x6f, 0x5e, 0xad, 0xc3, 0x75, 0x3c, 0x45, 0x79, 0x8e, 0xd6, 0x64, 0x60, 0x75, + 0x3c, 0x7d, 0xe9, 0xbc, 0x64, 0xff, 0x4b, 0xf3, 0xa2, 0x77, 0xc1, 0x9b, 0xcf, 0x0e, 0x4a, 0xc5, + 0xa1, 0x6e, 0x1f, 0xd6, 0x81, 0x12, 0xc6, 0xe7, 0x48, 0x93, 0x78, 0x52, 0xef, 0xbf, 0xa0, 0x2b, + 0x17, 0x10, 0xe2, 0x54, 0x16, 0x00, 0x7a, 0x03, 0x68, 0x4f, 0x59, 0x45, 0x01, 0xf1, 0x23, 0x7c, 0xe2, 0x7b, 0xc4, 0x87, 0x06, 0x58, 0xe3, 0xef, 0x19, 0x9f, 0xc9, 0xdc, 0x9e, 0x76, 0x31, 0x4a, - 0xd0, 0x36, 0x4c, 0xf6, 0x3b, 0x8a, 0xcd, 0x3e, 0x97, 0xcf, 0x7e, 0x29, 0x4a, 0xfa, 0xef, 0x2b, - 0xe0, 0xfa, 0x73, 0x20, 0xff, 0xc7, 0x43, 0x5e, 0x07, 0x6b, 0x63, 0x56, 0x50, 0x31, 0xe2, 0xa5, - 0x57, 0xe9, 0xd6, 0x52, 
0x1f, 0x04, 0x60, 0x8c, 0xa1, 0xcf, 0xd2, 0x60, 0xa3, 0xe9, 0x3b, 0x41, - 0xd4, 0x27, 0x34, 0xd9, 0x9f, 0x26, 0x48, 0xf7, 0xb1, 0xd3, 0xc1, 0x49, 0xa7, 0x3e, 0xba, 0x24, - 0xc2, 0x05, 0x3f, 0x63, 0x9f, 0x3b, 0x21, 0xe1, 0x0c, 0xdf, 0x05, 0xca, 0x60, 0x62, 0xb7, 0x19, - 0xc9, 0x78, 0xf5, 0xd6, 0x2b, 0x39, 0xd6, 0xa1, 0xfa, 0x29, 0xe7, 0x1d, 0xca, 0x0c, 0x26, 0x31, - 0x01, 0x8b, 0x20, 0x37, 0x24, 0x3d, 0x1b, 0xfb, 0x34, 0xf4, 0x70, 0xa4, 0xad, 0xee, 0xac, 0xee, - 0xae, 0x23, 0x30, 0x24, 0x3d, 0x33, 0xd6, 0xc0, 0x4d, 0xb0, 0xd6, 0xf5, 0x7c, 0x67, 0xc8, 0x13, - 0x56, 0x50, 0x7c, 0xd8, 0xfe, 0x59, 0x06, 0xe9, 0x38, 0x22, 0x74, 0xc0, 0x26, 0x5b, 0x73, 0xb6, - 0xd8, 0x6a, 0xb6, 0xa0, 0xa3, 0xe8, 0xd9, 0x95, 0xe9, 0x0c, 0xc3, 0x67, 0xdf, 0x94, 0xb7, 0x00, - 0x10, 0xdb, 0xc6, 0x7b, 0x80, 0x79, 0xff, 0x56, 0x51, 0x36, 0xde, 0x18, 0xde, 0x03, 0xcc, 0x72, - 0x70, 0x1d, 0xdf, 0xee, 0x60, 0x77, 0xe8, 0xf9, 0x58, 0x5c, 0x14, 0xb8, 0x8e, 0x5f, 0x8b, 0x35, - 0xd0, 0x02, 0x6b, 0xfc, 0xfb, 0x8a, 0xaf, 0xd9, 0xcb, 0x4b, 0x1a, 0x51, 0x12, 0x3a, 0x3d, 0x7c, - 0x4e, 0x84, 0x26, 0x73, 0x4a, 0x5a, 0xc6, 0x11, 0xd8, 0xc0, 0x06, 0xa1, 0x47, 0x42, 0x8f, 0x4e, - 0xf9, 0x23, 0x7f, 0xed, 0x52, 0x0a, 0x5c, 0x6c, 0xd0, 0xb1, 0x70, 0x43, 0x0b, 0x00, 0x06, 0x16, - 0x51, 0xb6, 0x22, 0x7a, 0x53, 0x2d, 0x73, 0x25, 0xb0, 0xa6, 0x70, 0x43, 0x0b, 0x00, 0xf8, 0x25, - 0xb8, 0x31, 0xf6, 0x05, 0xc7, 0x29, 0xee, 0xd8, 0x34, 0x1c, 0xfb, 0xb1, 0x14, 0xe7, 0xae, 0xf0, - 0xb2, 0x6c, 0x2f, 0xdb, 0xb4, 0x12, 0x13, 0x9e, 0x28, 0xfc, 0x02, 0xc8, 0x74, 0x1a, 0xb0, 0xed, - 0xca, 0xae, 0xf2, 0xc1, 0x2b, 0x5e, 0xa5, 0x35, 0x0d, 0x30, 0xe2, 0x8e, 0x07, 0xb2, 0x22, 0xa9, - 0x2b, 0xfa, 0x6d, 0xa0, 0x24, 0xb9, 0xc2, 0x1c, 0xc8, 0x9c, 0x34, 0xea, 0x8d, 0xa3, 0xaf, 0x1a, - 0x6a, 0x0a, 0xae, 0x03, 0x05, 0x99, 0xd5, 0xa3, 0x53, 0x13, 0x7d, 0xad, 0x4a, 0x30, 0x0f, 0xb2, - 0xc8, 0xac, 0x94, 0x0f, 0xcb, 0x8d, 0xaa, 0xa9, 0xae, 0xe8, 0x1a, 0x50, 0x92, 0xa4, 0x98, 0x61, - 0xfd, 0xd4, 0xae, 0x94, 0x5b, 0xd5, 0x7d, 0x35, 0xa5, 0x7f, 0x06, 0x64, 0x16, 0x03, 0x6e, 0x01, - 0x78, 0x6a, 0x95, 0xed, 0x66, 0xa3, 0x7c, 0xdc, 0xdc, 0x3f, 0x6a, 0xd9, 0xf7, 0x4e, 0xcc, 0x13, - 0x53, 0x4d, 0xc1, 0xeb, 0x60, 0xe3, 0xd0, 0x2c, 0xa3, 0x86, 0x89, 0x6c, 0xab, 0x61, 0xb5, 0xac, - 0xf2, 0xa1, 0x2a, 0xe9, 0xb2, 0xb2, 0xa2, 0xae, 0xe8, 0xbf, 0x49, 0x40, 0x3d, 0xbf, 0xaf, 0x58, - 0x5e, 0x77, 0x40, 0x9a, 0x55, 0x64, 0x1c, 0xf1, 0x09, 0xbb, 0xb6, 0x67, 0xbc, 0x34, 0xd1, 0xd8, - 0xd1, 0x68, 0x72, 0x2f, 0x24, 0xbc, 0xd9, 0xf3, 0x9d, 0xbc, 0xf7, 0x8c, 0xeb, 0xd9, 0xc5, 0xc3, - 0xae, 0x5b, 0x20, 0x1d, 0xdb, 0x3e, 0x93, 0x7f, 0xb9, 0x5a, 0x35, 0x8f, 0x5b, 0x66, 0x4d, 0x95, - 0xd8, 0x4f, 0xe5, 0xe3, 0xe3, 0x43, 0xcb, 0xac, 0xa9, 0x2b, 0x30, 0x0b, 0xd6, 0x4c, 0x84, 0x8e, - 0x90, 0xba, 0xca, 0xac, 0x6a, 0x66, 0xf5, 0xd0, 0x6a, 0x98, 0x35, 0x55, 0x3e, 0x90, 0x95, 0x55, - 0x55, 0xd6, 0xbf, 0x05, 0xaf, 0x55, 0x89, 0xdf, 0xad, 0xf6, 0x19, 0xe7, 0xab, 0xc4, 0xa7, 0xf8, - 0x47, 0x0a, 0x3f, 0x04, 0x80, 0x7d, 0x78, 0x3a, 0x7e, 0x27, 0x59, 0xc3, 0xd9, 0x4a, 0x7e, 0x3e, - 0x2b, 0x66, 0xab, 0xb1, 0xd6, 0xaa, 0xa1, 0xac, 0x30, 0xb0, 0x3a, 0xec, 0xb6, 0x81, 0x33, 0x1d, - 0x12, 0x27, 0xfe, 0x48, 0x5f, 0x47, 0xc9, 0xb1, 0xf2, 0xfe, 0xd9, 0x9f, 0x85, 0xd4, 0xd9, 0xbc, - 0x20, 0x3d, 0x9c, 0x17, 0xa4, 0x47, 0xf3, 0x82, 0xf4, 0xc7, 0xbc, 0x20, 0xfd, 0xf4, 0xb8, 0x90, - 0x7a, 0xf8, 0xb8, 0x90, 0x7a, 0xf4, 0xb8, 0x90, 0xfa, 0x46, 0x49, 0x6a, 0xd2, 0x4e, 0xf3, 0xff, - 0x35, 0x6e, 0xfd, 0x1d, 0x00, 0x00, 0xff, 0xff, 0x34, 0x3e, 0x1f, 0x22, 0x54, 0x0d, 0x00, 0x00, + 0xd0, 0x36, 0x4c, 0xf6, 0x3b, 0x8a, 0xcd, 0x3e, 
0x93, 0xcf, 0x7e, 0x2e, 0x4a, 0xfa, 0xef, 0x2b, + 0xe0, 0xf5, 0xe7, 0x40, 0xfe, 0x8f, 0x87, 0xbc, 0x0e, 0xd6, 0xc6, 0xac, 0xa0, 0x62, 0xc4, 0x4b, + 0xaf, 0xd2, 0xad, 0xa5, 0x3e, 0x08, 0xc0, 0x18, 0x43, 0xff, 0x2d, 0x0d, 0x36, 0x9a, 0xbe, 0x13, + 0x44, 0x7d, 0x42, 0x93, 0xfd, 0x69, 0x82, 0x74, 0x1f, 0x3b, 0x1d, 0x9c, 0x74, 0xea, 0xe3, 0x4b, + 0x22, 0x5c, 0xf0, 0x33, 0xf6, 0xb9, 0x13, 0x12, 0xce, 0xf0, 0x3d, 0xa0, 0x0c, 0x26, 0x76, 0x9b, + 0x91, 0x8c, 0x57, 0x6f, 0xbd, 0x92, 0x63, 0x1d, 0xaa, 0x9f, 0x72, 0xde, 0xa1, 0xcc, 0x60, 0x12, + 0x13, 0xb0, 0x08, 0x72, 0x43, 0xd2, 0xb3, 0xb1, 0x4f, 0x43, 0x0f, 0x47, 0xda, 0xea, 0xce, 0xea, + 0xee, 0x3a, 0x02, 0x43, 0xd2, 0x33, 0x63, 0x0d, 0xdc, 0x04, 0x6b, 0x5d, 0xcf, 0x77, 0x86, 0x3c, + 0x61, 0x05, 0xc5, 0x87, 0xed, 0x9f, 0x64, 0x90, 0x8e, 0x23, 0x42, 0x07, 0x6c, 0xb2, 0x35, 0x67, + 0x8b, 0xad, 0x66, 0x0b, 0x3a, 0x8a, 0x9e, 0x5d, 0x99, 0xce, 0x30, 0x7c, 0xf6, 0x4d, 0x79, 0x1b, + 0x00, 0xb1, 0x6d, 0xbc, 0x07, 0x98, 0xf7, 0x6f, 0x15, 0x65, 0xe3, 0x8d, 0xe1, 0x3d, 0xc0, 0x2c, + 0x07, 0xd7, 0xf1, 0xed, 0x0e, 0x76, 0x87, 0x9e, 0x8f, 0xc5, 0x45, 0x81, 0xeb, 0xf8, 0xb5, 0x58, + 0x03, 0x2d, 0xb0, 0xc6, 0xbf, 0xaf, 0xf8, 0x9a, 0xbd, 0xbc, 0xa4, 0x11, 0x25, 0xa1, 0xd3, 0xc3, + 0xe7, 0x44, 0x68, 0x32, 0xa7, 0xa4, 0x65, 0x1c, 0x81, 0x0d, 0x6c, 0x10, 0x7a, 0x24, 0xf4, 0xe8, + 0x94, 0x3f, 0xf2, 0xd7, 0x2e, 0xa5, 0xc0, 0xc5, 0x06, 0x35, 0x84, 0x1b, 0x5a, 0x00, 0x30, 0xb0, + 0x88, 0xb2, 0x15, 0xd1, 0x9b, 0x6a, 0x99, 0x2b, 0x81, 0x35, 0x85, 0x1b, 0x5a, 0x00, 0xc0, 0x2f, + 0xc0, 0xf5, 0xb1, 0x2f, 0x38, 0x4e, 0x71, 0xc7, 0xa6, 0xe1, 0xd8, 0x8f, 0xa5, 0x38, 0x77, 0x85, + 0x97, 0x65, 0x7b, 0xd9, 0xa6, 0x95, 0x98, 0xf0, 0x44, 0xe1, 0xe7, 0x40, 0xa6, 0xd3, 0x80, 0x6d, + 0x57, 0x76, 0x95, 0x0f, 0x5f, 0xf1, 0x2a, 0xad, 0x69, 0x80, 0x11, 0x77, 0x3c, 0x90, 0x15, 0x49, + 0x5d, 0xd1, 0x6f, 0x03, 0x25, 0xc9, 0x15, 0xe6, 0x40, 0xe6, 0xe4, 0xa8, 0x7e, 0x74, 0xfc, 0xe5, + 0x91, 0x9a, 0x82, 0xeb, 0x40, 0x41, 0x66, 0xf5, 0xf8, 0xd4, 0x44, 0x5f, 0xa9, 0x12, 0xcc, 0x83, + 0x2c, 0x32, 0x2b, 0xe5, 0xc3, 0xf2, 0x51, 0xd5, 0x54, 0x57, 0x74, 0x0d, 0x28, 0x49, 0x52, 0xcc, + 0xb0, 0x7e, 0x6a, 0x57, 0xca, 0xad, 0xea, 0xbe, 0x9a, 0xd2, 0x6f, 0x02, 0x99, 0xc5, 0x80, 0x5b, + 0x00, 0x9e, 0x5a, 0x65, 0xbb, 0x79, 0x54, 0x6e, 0x34, 0xf7, 0x8f, 0x5b, 0xf6, 0xbd, 0x13, 0xf3, + 0xc4, 0x54, 0x53, 0x2c, 0x86, 0x75, 0x64, 0xb5, 0xac, 0xf2, 0xa1, 0x2a, 0xe9, 0xb2, 0xb2, 0xa2, + 0xae, 0xe8, 0xbf, 0x4a, 0x40, 0x3d, 0xbf, 0xa7, 0x58, 0x5a, 0x77, 0x40, 0x9a, 0x55, 0x62, 0x1c, + 0xf1, 0xc9, 0xba, 0xb6, 0x67, 0xbc, 0x34, 0xc1, 0xd8, 0xd1, 0x68, 0x72, 0x2f, 0x24, 0xbc, 0xd9, + 0xb3, 0x9d, 0xbc, 0xf3, 0x8c, 0xe3, 0xd9, 0xc5, 0x83, 0xae, 0x5b, 0x20, 0x1d, 0xdb, 0x3e, 0x93, + 0x77, 0xb9, 0x5a, 0x35, 0x1b, 0x2d, 0xb3, 0xa6, 0x4a, 0xec, 0xa7, 0x72, 0xa3, 0x71, 0x68, 0x99, + 0x35, 0x75, 0x05, 0x66, 0xc1, 0x9a, 0x89, 0xd0, 0x31, 0x52, 0x57, 0x99, 0x55, 0xcd, 0xac, 0x1e, + 0x5a, 0x47, 0x66, 0x4d, 0x95, 0x0f, 0x64, 0x65, 0x55, 0x95, 0xf5, 0x6f, 0xc0, 0x6b, 0x55, 0xe2, + 0x77, 0xab, 0x7d, 0xc6, 0xf5, 0x2a, 0xf1, 0x29, 0xfe, 0x81, 0xc2, 0x8f, 0x00, 0x60, 0x1f, 0x9c, + 0x8e, 0xdf, 0x49, 0xd6, 0x6f, 0xb6, 0x92, 0x9f, 0xcf, 0x8a, 0xd9, 0x6a, 0xac, 0xb5, 0x6a, 0x28, + 0x2b, 0x0c, 0xac, 0x0e, 0xbb, 0x6d, 0xe0, 0x4c, 0x87, 0xc4, 0x89, 0x3f, 0xce, 0xd7, 0x51, 0x72, + 0xac, 0x7c, 0x70, 0xf6, 0x67, 0x21, 0x75, 0x36, 0x2f, 0x48, 0x0f, 0xe7, 0x05, 0xe9, 0xd1, 0xbc, + 0x20, 0xfd, 0x31, 0x2f, 0x48, 0x3f, 0x3e, 0x2e, 0xa4, 0x1e, 0x3e, 0x2e, 0xa4, 0x1e, 0x3d, 0x2e, + 0xa4, 0xbe, 0x56, 0x92, 0x9a, 0xb4, 0xd3, 0xfc, 0x7f, 0x8c, 0x5b, 0x7f, 
0x07, 0x00, 0x00, 0xff, + 0xff, 0x99, 0x91, 0x5c, 0xdb, 0x4c, 0x0d, 0x00, 0x00, } diff --git a/pkg/kv/kvserver/raft.proto b/pkg/kv/kvserver/raft.proto index 0abb9a1477a3..d525efe43b0d 100644 --- a/pkg/kv/kvserver/raft.proto +++ b/pkg/kv/kvserver/raft.proto @@ -141,14 +141,14 @@ message SnapshotRequest { // applySnapshot in replica_raftstorage.go. enum Type { // VIA_SNAPSHOT_QUEUE indicates the snapshots sent by the raft snapshot - // queue. + // queue to all types of replicas. VIA_SNAPSHOT_QUEUE = 0; - // LEARNER_INITIAL indicates the initial snapshots sent to LEARNER replicas - // for upreplication, before they're promoted to full voters. + // INITIAL indicates the initial snapshots sent to LEARNER (before they're + // promoted to full voters) and NON_VOTER replicas for upreplication. // // As of the time of writing, we only send this snapshot from the - // execReplicationChangesForVoters after creating a new LEARNER replica. - LEARNER_INITIAL = 1; + // initializeRaftLearners after creating a new LEARNER or NON_VOTER replica. + INITIAL = 1; reserved 2; } diff --git a/pkg/kv/kvserver/raft_log_queue.go b/pkg/kv/kvserver/raft_log_queue.go index 71d1cb7fb50f..f7f725ff4f0b 100644 --- a/pkg/kv/kvserver/raft_log_queue.go +++ b/pkg/kv/kvserver/raft_log_queue.go @@ -46,11 +46,13 @@ const ( // Allow a limited number of Raft log truncations to be processed // concurrently. raftLogQueueConcurrency = 4 - // While a snapshot is in flight, we won't truncate past the snapshot's log - // index. This behavior is extended to a grace period after the snapshot is - // marked as completed as it is applied at the receiver only a little later, - // leaving a window for a truncation that requires another snapshot. - raftLogQueuePendingSnapshotGracePeriod = 3 * time.Second + // RaftLogQueuePendingSnapshotGracePeriod indicates the grace period after an + // in-flight snapshot is marked completed. While a snapshot is in-flight we + // will not truncate past the snapshot's log index but we also don't want to + // do so the moment the in-flight snapshot completes, since it is only applied + // at the receiver a little later. This grace period reduces the probability + // of an ill-timed log truncation that would necessitate another snapshot. + RaftLogQueuePendingSnapshotGracePeriod = 3 * time.Second ) // raftLogQueue manages a queue of replicas slated to have their raft logs diff --git a/pkg/kv/kvserver/raft_log_queue_test.go b/pkg/kv/kvserver/raft_log_queue_test.go index c7eea225b484..b2e73f860497 100644 --- a/pkg/kv/kvserver/raft_log_queue_test.go +++ b/pkg/kv/kvserver/raft_log_queue_test.go @@ -683,17 +683,17 @@ func TestSnapshotLogTruncationConstraints(t *testing.T) { r.completeSnapshotLogTruncationConstraint(ctx, id1, now) // The index should show up when its deadline isn't hit. assertMin(index1, now) - assertMin(index1, now.Add(raftLogQueuePendingSnapshotGracePeriod)) - assertMin(index1, now.Add(raftLogQueuePendingSnapshotGracePeriod)) + assertMin(index1, now.Add(RaftLogQueuePendingSnapshotGracePeriod)) + assertMin(index1, now.Add(RaftLogQueuePendingSnapshotGracePeriod)) // Once we're over deadline, the index returned so far disappears. 
- assertMin(index2, now.Add(raftLogQueuePendingSnapshotGracePeriod+1)) + assertMin(index2, now.Add(RaftLogQueuePendingSnapshotGracePeriod+1)) assertMin(index2, time.Time{}) - assertMin(index2, now.Add(10*raftLogQueuePendingSnapshotGracePeriod)) + assertMin(index2, now.Add(10*RaftLogQueuePendingSnapshotGracePeriod)) r.completeSnapshotLogTruncationConstraint(ctx, id2, now) assertMin(index2, now) - assertMin(index2, now.Add(raftLogQueuePendingSnapshotGracePeriod)) - assertMin(0, now.Add(2*raftLogQueuePendingSnapshotGracePeriod)) + assertMin(index2, now.Add(RaftLogQueuePendingSnapshotGracePeriod)) + assertMin(0, now.Add(2*RaftLogQueuePendingSnapshotGracePeriod)) assert.Equal(t, r.mu.snapshotLogTruncationConstraints, map[uuid.UUID]snapTruncationInfo(nil)) } diff --git a/pkg/kv/kvserver/raft_snapshot_queue.go b/pkg/kv/kvserver/raft_snapshot_queue.go index f05f12e27b14..3dd6bfe6e918 100644 --- a/pkg/kv/kvserver/raft_snapshot_queue.go +++ b/pkg/kv/kvserver/raft_snapshot_queue.go @@ -110,20 +110,27 @@ func (rq *raftSnapshotQueue) processRaftSnapshot( } snapType := SnapshotRequest_VIA_SNAPSHOT_QUEUE - if repDesc.GetType() == roachpb.LEARNER { - if fn := repl.store.cfg.TestingKnobs.ReplicaSkipLearnerSnapshot; fn != nil && fn() { + if typ := repDesc.GetType(); typ == roachpb.LEARNER || typ == roachpb.NON_VOTER { + if fn := repl.store.cfg.TestingKnobs.RaftSnapshotQueueSkipReplica; fn != nil && fn() { return nil } if index := repl.getAndGCSnapshotLogTruncationConstraints( timeutil.Now(), repDesc.StoreID, ); index > 0 { - // There is a snapshot being transferred. It's probably a LEARNER snap, so - // bail for now and try again later. + // There is a snapshot being transferred. It's probably an INITIAL snap, + // so bail for now and try again later. err := errors.Errorf( - "skipping snapshot; replica is likely a learner in the process of being added: %s", repDesc) + "skipping snapshot; replica is likely a %s in the process of being added: %s", + typ, + repDesc, + ) // TODO(knz): print the error instead when the error package // knows how to expose redactable strings. - log.Infof(ctx, "skipping snapshot; replica is likely a learner in the process of being added: %s", repDesc) + log.Infof(ctx, + "skipping snapshot; replica is likely a %s in the process of being added: %s", + typ, + repDesc, + ) // TODO(dan): This is super brittle and non-obvious. In the common case, // this check avoids duplicate work, but in rare cases, we send the // learner snap at an index before the one raft wanted here. The raft @@ -135,7 +142,7 @@ func (rq *raftSnapshotQueue) processRaftSnapshot( // sufficient, this message will be ignored, but if we hit the case // described above, this will cause raft to keep asking for a snap and at // some point the snapshot lock above will be released and we'll fall - // through to the below. + // through to the logic below. repl.reportSnapshotStatus(ctx, repDesc.ReplicaID, err) return nil } diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index e9af13044212..b61b81be9f1a 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -603,16 +603,6 @@ func (r *Replica) AdminMerge( return err } - // Ensure that every current replica of the LHS has been initialized. - // Otherwise there is a rare race where the replica GC queue can GC a - // replica of the RHS too early. The comment on - // TestStoreRangeMergeUninitializedLHSFollower explains the situation in full. 
- if err := waitForReplicasInit( - ctx, r.store.cfg.NodeDialer, origLeftDesc.RangeID, origLeftDesc.Replicas().Descriptors(), - ); err != nil { - return errors.Wrap(err, "waiting for all left-hand replicas to initialize") - } - // Do a consistent read of the right hand side's range descriptor. // Again, use a locking read because we intend to update this key // shortly. @@ -655,6 +645,29 @@ func (r *Replica) AdminMerge( if !replicasCollocated(lReplicas.Descriptors(), rReplicas.Descriptors()) { return errors.Errorf("ranges not collocated; %s != %s", lReplicas, rReplicas) } + + // Ensure that every current replica of the LHS has been initialized. + // Otherwise there is a rare race where the replica GC queue can GC a + // replica of the RHS too early. The comment on + // TestStoreRangeMergeUninitializedLHSFollower explains the situation in full. + if err := waitForReplicasInit( + ctx, r.store.cfg.NodeDialer, origLeftDesc.RangeID, origLeftDesc.Replicas().Descriptors(), + ); err != nil { + return errors.Wrap(err, "waiting for all left-hand replicas to initialize") + } + // Out of an abundance of caution, also ensure that replicas of the RHS have + // all been initialized. If for whatever reason the initial upreplication + // snapshot for a NON_VOTER on the RHS fails, it will have to get picked up + // by the raft snapshot queue to upreplicate and may be uninitialized at + // this point. As such, if we send a subsume request to the RHS in this sort + // of state, we will wastefully and unintentionally block all traffic on it + // for 5 seconds. + if err := waitForReplicasInit( + ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, rightDesc.Replicas().Descriptors(), + ); err != nil { + return errors.Wrap(err, "waiting for all right-hand replicas to initialize") + } + mergeReplicas := lReplicas.Descriptors() updatedLeftDesc := *origLeftDesc @@ -1019,39 +1032,33 @@ func (r *Replica) changeReplicasImpl( } if adds := targets.voterAdditions; len(adds) > 0 { - // Lock learner snapshots even before we run the ConfChange txn to add them - // to prevent a race with the raft snapshot queue trying to send it first. - // Note that this lock needs to cover sending the snapshots which happens in - _ = r.execReplicationChangesForVoters - // which also has some more details on what's going on here. - // - // Also note that the lock only prevents the raft snapshot queue from - // sending snapshots to learner replicas, it will still send them to voters. - // There are more details about this locking in - _ = (*raftSnapshotQueue)(nil).processRaftSnapshot - // as well as a TODO about fixing all this to be less subtle and brittle. - releaseSnapshotLockFn := r.lockLearnerSnapshot(ctx, adds) - defer releaseSnapshotLockFn() - - // For all newly added nodes, first add raft learner replicas. They accept raft traffic - // (so they can catch up) but don't get to vote (so they don't affect quorum and thus - // don't introduce fragility into the system). For details see: + // For all newly added voters, first add LEARNER replicas. They accept raft + // traffic (so they can catch up) but don't get to vote (so they don't + // affect quorum and thus don't introduce fragility into the system). 
For + // details see: _ = roachpb.ReplicaSet.LearnerDescriptors var err error - desc, err = addRaftLearners(ctx, r.store, desc, reason, details, adds, internalChangeTypeAddLearner) + desc, err = r.initializeRaftLearners( + ctx, desc, priority, reason, details, adds, roachpb.LEARNER, + ) if err != nil { return nil, err } + + if fn := r.store.cfg.TestingKnobs.VoterAddStopAfterLearnerSnapshot; fn != nil && fn(adds) { + return desc, nil + } } if len(targets.voterAdditions)+len(targets.voterRemovals) > 0 { - // Catch up any learners, then run the atomic replication change that adds the - // final voters and removes any undesirable replicas. - desc, err = r.execReplicationChangesForVoters(ctx, desc, priority, reason, details, - targets.voterAdditions, targets.voterRemovals) + desc, err = r.execReplicationChangesForVoters( + ctx, desc, reason, details, + targets.voterAdditions, targets.voterRemovals, + ) if err != nil { - // If the error occurred while transitioning out of an atomic replication change, - // try again here with a fresh descriptor; this is a noop otherwise. + // If the error occurred while transitioning out of an atomic replication + // change, try again here with a fresh descriptor; this is a noop + // otherwise. if _, err := maybeLeaveAtomicChangeReplicas(ctx, r.store, r.Desc()); err != nil { return nil, err } @@ -1063,7 +1070,7 @@ func (r *Replica) changeReplicasImpl( if adds := targets.voterAdditions; len(adds) > 0 { log.Infof(ctx, "could not promote %v to voter, rolling back: %v", adds, err) for _, target := range adds { - r.tryRollBackLearnerReplica(ctx, r.Desc(), target, reason, details) + r.tryRollbackRaftLearner(ctx, r.Desc(), target, reason, details) } } return nil, err @@ -1071,14 +1078,22 @@ func (r *Replica) changeReplicasImpl( } if adds := targets.nonVoterAdditions; len(adds) > 0 { - desc, err = addRaftLearners(ctx, r.store, desc, reason, details, adds, internalChangeTypeAddNonVoter) + // Add all non-voters and send them initial snapshots since some callers of + // `AdminChangeReplicas` (notably the mergeQueue, via `AdminRelocateRange`) + // care about only dealing with replicas that are mostly caught up with the + // raft leader. Not attempting to upreplicate these non-voters here can lead + // to spurious interactions that can fail range merges and cause severe + // disruption to foreground traffic. See + // https://github.com/cockroachdb/cockroach/issues/63199 for an example. + desc, err = r.initializeRaftLearners( + ctx, desc, priority, reason, details, adds, roachpb.NON_VOTER, + ) if err != nil { return nil, err } - // Queue the replica up into the raft snapshot queue so that the non-voters - // that were added receive their first snapshot relatively soon. See the - // comment block above ReplicaSet.NonVoters() for why we do this. - r.store.raftSnapshotQueue.AddAsync(ctx, r, raftSnapshotPriority) + if fn := r.store.TestingKnobs().NonVoterAfterInitialization; fn != nil { + fn() + } } if removals := targets.nonVoterRemovals; len(removals) > 0 { @@ -1498,16 +1513,76 @@ func getChangesByNodeID(chgs roachpb.ReplicationChanges) changesByNodeID { return chgsByNodeID } -// addRaftLearners adds etcd/raft learners to the given replication targets. -func addRaftLearners( +// initializeRaftLearners adds etcd LearnerNodes (LEARNERs or NON_VOTERs in +// Cockroach-land) to the given replication targets and synchronously sends them +// an initial snapshot to upreplicate. 
Once this successfully returns, the +// callers can assume that the learners were added and have been initialized via +// that snapshot. Otherwise, if we get any errors trying to add or upreplicate +// any of these learners, this function will clean up after itself by rolling all +// of them back. +func (r *Replica) initializeRaftLearners( ctx context.Context, - s *Store, desc *roachpb.RangeDescriptor, + priority SnapshotRequest_Priority, reason kvserverpb.RangeLogEventReason, details string, targets []roachpb.ReplicationTarget, - typ internalChangeType, -) (*roachpb.RangeDescriptor, error) { + replicaType roachpb.ReplicaType, +) (afterDesc *roachpb.RangeDescriptor, err error) { + var iChangeType internalChangeType + switch replicaType { + case roachpb.LEARNER: + iChangeType = internalChangeTypeAddLearner + case roachpb.NON_VOTER: + iChangeType = internalChangeTypeAddNonVoter + default: + log.Fatalf(ctx, "unexpected replicaType %s", replicaType) + } + // Lock learner snapshots even before we run the ConfChange txn to add them + // to prevent a race with the raft snapshot queue trying to send it first. + // + // Also note that the lock only prevents the raft snapshot queue from sending + // snapshots to learners and non-voters, it will still send them to voters. + // There are more details about this locking in + _ = (*raftSnapshotQueue)(nil).processRaftSnapshot + // as well as a TODO about fixing all this to be less subtle and brittle. + releaseSnapshotLockFn := r.lockLearnerSnapshot(ctx, targets) + defer releaseSnapshotLockFn() + + // If we fail to add or send a snapshot to the learners we're adding, roll + // them all back. + defer func() { + if err != nil { + log.Infof( + ctx, + "could not successfully add and upreplicate %s replica(s) on %s, rolling back: %v", + replicaType, + targets, + err, + ) + if fn := r.store.cfg.TestingKnobs.ReplicaAddSkipLearnerRollback; fn != nil && fn() { + return + } + // TODO(aayush): We could probably do better here by just rolling back the + // learners for which the initial snapshot failed. However: + // + // - As of the time of this writing, `AdminChangeReplicas` is only called + // for one change at a time by its callers so it doesn't seem worth it to + // befog its contract. + // - If, in the future we start executing multiple changes in an + // `AdminChangeReplicas` call: in case of voter addition, if we only + // rolled some of the learners back but not the others, its not obvious + // how we could still go ahead with their promotion and it also becomes + // unclear how the error received here should be propagated to the client. + for _, target := range targets { + // NB: We're checking `r.Desc()` from this replica's current applied + // state, not the `desc` we received as a parameter to this call (which + // is stale and won't reflect the work we did in this function). + r.tryRollbackRaftLearner(ctx, r.Desc(), target, reason, details) + } + } + }() + // TODO(tbg): we could add all learners in one go, but then we'd need to // do it as an atomic replication change (raft doesn't know which config // to apply the delta to, so we might be demoting more than one voter). @@ -1515,21 +1590,89 @@ func addRaftLearners( // before returning from this method, and it's unclear that it's worth // doing. 
for _, target := range targets { - iChgs := []internalReplicationChange{{target: target, typ: typ}} + iChgs := []internalReplicationChange{{target: target, typ: iChangeType}} var err error desc, err = execChangeReplicasTxn( ctx, desc, reason, details, iChgs, changeReplicasTxnArgs{ - db: s.DB(), - liveAndDeadReplicas: s.allocator.storePool.liveAndDeadReplicas, - logChange: s.logChange, - testForceJointConfig: s.TestingKnobs().ReplicationAlwaysUseJointConfig, - testAllowDangerousReplicationChanges: s.TestingKnobs().AllowDangerousReplicationChanges, + db: r.store.DB(), + liveAndDeadReplicas: r.store.allocator.storePool.liveAndDeadReplicas, + logChange: r.store.logChange, + testForceJointConfig: r.store.TestingKnobs().ReplicationAlwaysUseJointConfig, + testAllowDangerousReplicationChanges: r.store.TestingKnobs().AllowDangerousReplicationChanges, }, ) if err != nil { return nil, err } } + + // Wait for our replica to catch up with the descriptor change. The replica is + // expected to usually be already caught up because it's expected to usually + // be the leaseholder - but it doesn't have to be. Being caught up is + // important because we are sending snapshots below to newly-added replicas, + // and those snapshots would be invalid if our stale descriptor doesn't + // contain the respective replicas. + // TODO(andrei): Find a better way to wait for replication. If we knew the + // LAI of the respective command, we could use waitForApplication(). + descriptorOK := false + start := timeutil.Now() + retOpts := retry.Options{InitialBackoff: time.Second, MaxBackoff: time.Second, MaxRetries: 10} + for re := retry.StartWithCtx(ctx, retOpts); re.Next(); { + rDesc := r.Desc() + if rDesc.Generation >= desc.Generation { + descriptorOK = true + break + } + log.VEventf(ctx, 1, "stale descriptor detected; waiting to catch up to replication. want: %s, have: %s", + desc, rDesc) + if _, err := r.IsDestroyed(); err != nil { + return nil, errors.Wrapf( + err, + "replica destroyed while waiting desc replication", + ) + } + } + if !descriptorOK { + return nil, errors.Newf( + "waited for %s and replication hasn't caught up with descriptor update", + timeutil.Since(start), + ) + } + + for _, target := range targets { + rDesc, ok := desc.GetReplicaDescriptor(target.StoreID) + if !ok { + return nil, errors.Errorf("programming error: replica %v not found in %v", target, desc) + } + + if rDesc.GetType() != replicaType { + return nil, errors.Errorf("programming error: cannot promote replica of type %s", rDesc.Type) + } + + if fn := r.store.cfg.TestingKnobs.ReplicaSkipInitialSnapshot; fn != nil && fn() { + continue + } + + // Note that raft snapshot queue will refuse to send a snapshot to a learner + // or non-voter replica if its store is already sending a snapshot to that + // replica. That would race with this snapshot, except that we've put a + // (best effort) lock on it before the conf change txn was run (see call to + // `lockLearnerSnapshot` above). This is best effort because the lock can + // time out and the lock is local to this node, while the raft leader could + // be on another node entirely (they're usually co-located but this is not + // guaranteed). + // + // We originally tried always refusing to send snapshots from the raft + // snapshot queue to learner replicas, but this turned out to be brittle. + // First, if the snapshot failed, any attempt to use the learner's raft + // group would hang until the replicate queue got around to cleaning up the + // orphaned learner. 
Second, this tickled some bugs in etcd/raft around + // switching between StateSnapshot and StateProbe. Even if we worked through + // these, it would be susceptible to future similar issues. + if err := r.sendSnapshot(ctx, rDesc, SnapshotRequest_INITIAL, priority); err != nil { + return nil, err + } + } return desc, nil } @@ -1562,9 +1705,8 @@ func (r *Replica) lockLearnerSnapshot( // execReplicationChangesForVoters carries out the atomic membership change that // finalizes the addition and/or removal of voting replicas. Any voters in the // process of being added (as reflected by the replication changes) must have -// been added as learners already and will be caught up before being promoted to -// voters. Cluster version permitting, voter removals (from the replication -// changes) will preferably be carried out by first demoting to a learner +// been added as learners already and caught up. Voter removals (from the +// replication changes) will be carried out by first demoting to a learner // instead of outright removal (this avoids a [raft-bug] that can lead to // unavailability). All of this occurs in one atomic raft membership change // which is carried out across two phases. On error, it is possible that the @@ -1584,7 +1726,6 @@ func (r *Replica) lockLearnerSnapshot( func (r *Replica) execReplicationChangesForVoters( ctx context.Context, desc *roachpb.RangeDescriptor, - priority SnapshotRequest_Priority, reason kvserverpb.RangeLogEventReason, details string, voterAdditions, voterRemovals []roachpb.ReplicationTarget, @@ -1593,77 +1734,9 @@ func (r *Replica) execReplicationChangesForVoters( // this may want to detect that and retry, sending a snapshot and promoting // both sides. - // Wait for our replica to catch up with the descriptor change. The replica is - // expected to usually be already caught up because it's expected to usually - // be the leaseholder - but it doesn't have to be. Being caught up is - // important because we might need to send snapshots below to newly-added - // replicas, and those snapshots would be invalid if our stale descriptor - // doesn't contain the respective replicas. - // TODO(andrei): Find a better way to wait for replication. If we knew the - // LAI of the respective command, we could use waitForApplication(). - descriptorOK := false - start := timeutil.Now() - retOpts := retry.Options{InitialBackoff: time.Second, MaxBackoff: time.Second, MaxRetries: 10} - for re := retry.StartWithCtx(ctx, retOpts); re.Next(); { - rDesc := r.Desc() - if rDesc.Generation >= desc.Generation { - descriptorOK = true - break - } - log.VEventf(ctx, 1, "stale descriptor detected; waiting to catch up to replication. want: %s, have: %s", - desc, rDesc) - if _, err := r.IsDestroyed(); err != nil { - return nil, errors.Wrapf(err, "replica destroyed while waiting desc replication") - } - } - if !descriptorOK { - return nil, errors.Newf( - "waited for %s and replication hasn't caught up with descriptor update", timeutil.Since(start)) - } - iChgs := make([]internalReplicationChange, 0, len(voterAdditions)+len(voterRemovals)) - for _, target := range voterAdditions { iChgs = append(iChgs, internalReplicationChange{target: target, typ: internalChangeTypePromoteLearner}) - // All adds must be present as learners right now, and we send them - // snapshots in anticipation of promoting them to voters. 
- rDesc, ok := desc.GetReplicaDescriptor(target.StoreID) - if !ok { - return nil, errors.Errorf("programming error: replica %v not found in %v", target, desc) - } - - if rDesc.GetType() != roachpb.LEARNER { - return nil, errors.Errorf("programming error: cannot promote replica of type %s", rDesc.Type) - } - - if fn := r.store.cfg.TestingKnobs.ReplicaSkipLearnerSnapshot; fn != nil && fn() { - continue - } - - // Note that raft snapshot queue will refuse to send a snapshot to a learner - // replica if its store is already sending a snapshot to that replica. That - // would race with this snapshot, except that we've put a (best effort) lock - // on it before the conf change txn was run. This is best effort because the - // lock can time out and the lock is local to this node, while the raft - // leader could be on another node entirely (they're usually co-located but - // this is not guaranteed). - // - // We originally tried always refusing to send snapshots from the raft - // snapshot queue to learner replicas, but this turned out to be brittle. - // First, if the snapshot failed, any attempt to use the learner's raft - // group would hang until the replicate queue got around to cleaning up the - // orphaned learner. Second, this tickled some bugs in etcd/raft around - // switching between StateSnapshot and StateProbe. Even if we worked through - // these, it would be susceptible to future similar issues. - if err := r.sendSnapshot(ctx, rDesc, SnapshotRequest_LEARNER_INITIAL, priority); err != nil { - return nil, err - } - } - - if adds := voterAdditions; len(adds) > 0 { - if fn := r.store.cfg.TestingKnobs.ReplicaAddStopAfterLearnerSnapshot; fn != nil && fn(adds) { - return desc, nil - } } for _, target := range voterRemovals { @@ -1695,19 +1768,20 @@ func (r *Replica) execReplicationChangesForVoters( return maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, r.store, desc) } -// tryRollbackLearnerReplica attempts to remove a learner specified by the -// target. If no such learner is found in the descriptor (including when it is a -// voter instead), no action is taken. Otherwise, a single time-limited -// best-effort attempt at removing the learner is made. -func (r *Replica) tryRollBackLearnerReplica( +// tryRollbackRaftLearner attempts to remove a learner specified by the target. +// If no such learner is found in the descriptor (including when it is a voter +// instead), no action is taken. Otherwise, a single time-limited best-effort +// attempt at removing the learner is made. +func (r *Replica) tryRollbackRaftLearner( ctx context.Context, - desc *roachpb.RangeDescriptor, + rangeDesc *roachpb.RangeDescriptor, target roachpb.ReplicationTarget, reason kvserverpb.RangeLogEventReason, details string, ) { - repDesc, ok := desc.GetReplicaDescriptor(target.StoreID) - if !ok || repDesc.GetType() != roachpb.LEARNER { + repDesc, ok := rangeDesc.GetReplicaDescriptor(target.StoreID) + isLearnerOrNonVoter := repDesc.GetType() == roachpb.LEARNER || repDesc.GetType() == roachpb.NON_VOTER + if !ok || !isLearnerOrNonVoter { // There's no learner to roll back. 
log.Event(ctx, "learner to roll back not found; skipping") return @@ -1721,7 +1795,7 @@ func (r *Replica) tryRollBackLearnerReplica( rollbackFn := func(ctx context.Context) error { _, err := execChangeReplicasTxn( - ctx, desc, reason, details, + ctx, rangeDesc, reason, details, []internalReplicationChange{{target: target, typ: internalChangeTypeRemove}}, changeReplicasTxnArgs{ db: r.store.DB(), @@ -1736,11 +1810,16 @@ func (r *Replica) tryRollBackLearnerReplica( if err := contextutil.RunWithTimeout( rollbackCtx, "learner rollback", rollbackTimeout, rollbackFn, ); err != nil { - log.Infof(ctx, - "failed to rollback learner %s, abandoning it for the replicate queue: %v", target, err) + log.Infof( + ctx, + "failed to rollback %s %s, abandoning it for the replicate queue: %v", + repDesc.GetType(), + target, + err, + ) r.store.replicateQueue.MaybeAddAsync(ctx, r, r.store.Clock().NowAsClockTimestamp()) } else { - log.Infof(ctx, "rolled back learner %s in %s", target, desc) + log.Infof(ctx, "rolled back %s %s in %s", repDesc.GetType(), target, rangeDesc) } } diff --git a/pkg/kv/kvserver/replica_learner_test.go b/pkg/kv/kvserver/replica_learner_test.go index b4b752657abd..e6835081cf23 100644 --- a/pkg/kv/kvserver/replica_learner_test.go +++ b/pkg/kv/kvserver/replica_learner_test.go @@ -74,7 +74,7 @@ func (rtl *replicationTestKnobs) withStopAfterJointConfig(f func()) { func makeReplicationTestKnobs() (base.TestingKnobs, *replicationTestKnobs) { var k replicationTestKnobs - k.storeKnobs.ReplicaAddStopAfterLearnerSnapshot = func(_ []roachpb.ReplicationTarget) bool { + k.storeKnobs.VoterAddStopAfterLearnerSnapshot = func(_ []roachpb.ReplicationTarget) bool { return atomic.LoadInt64(&k.replicaAddStopAfterLearnerAtomic) > 0 } k.storeKnobs.VoterAddStopAfterJointConfig = func() bool { @@ -198,17 +198,8 @@ func TestAddRemoveNonVotingReplicasBasic(t *testing.T) { defer log.Scope(t).Close(t) ctx := context.Background() - knobs, ltk := makeReplicationTestKnobs() - blockUntilSnapshotCh := make(chan struct{}) - ltk.storeKnobs.ReceiveSnapshot = func(h *kvserver.SnapshotRequest_Header) error { - if h.Type == kvserver.SnapshotRequest_VIA_SNAPSHOT_QUEUE { - close(blockUntilSnapshotCh) - } - return nil - } tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, - ServerArgs: base.TestServerArgs{Knobs: knobs}, }) defer tc.Stopper().Stop(ctx) @@ -218,16 +209,6 @@ func TestAddRemoveNonVotingReplicasBasic(t *testing.T) { _, err := tc.AddNonVoters(scratchStartKey, tc.Target(1)) return err }) - - // When we create a non-voting replica, we queue up its range leaseholder into - // the raft snapshot queue so that it can receive its LEARNER snapshot and - // upreplicate relatively soon (i.e. without having to wait up to a full scanner - // cycle). 
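(Illustrative aside, not part of this diff: because `AddNonVoters` now returns only once the non-voter has applied its INITIAL snapshot, tests no longer need this kind of snapshot-queue synchronization and can, hypothetically, assert initialization right away; the cluster setup and key names below are placeholders.)

    // Hypothetical follow-up assertions, assuming a 2-node TestCluster and a
    // scratch range; not taken from this patch.
    _, err := tc.AddNonVoters(scratchStartKey, tc.Target(1))
    require.NoError(t, err)
    store := tc.GetFirstStoreFromServer(t, 1)
    repl := store.LookupReplica(roachpb.RKey(scratchStartKey))
    require.NotNil(t, repl)
    require.True(t, repl.IsInitialized())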
- select { - case <-blockUntilSnapshotCh: - case <-time.After(30 * time.Second): - t.Fatal(`test timed out; did not receive snapshot of type VIA_SNAPSHOT_QUEUE as expected`) - } require.NoError(t, g.Wait()) desc := tc.LookupRangeOrFatal(t, scratchStartKey) @@ -328,35 +309,124 @@ func TestLearnerSnapshotFailsRollback(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - var rejectSnapshots int64 - knobs, ltk := makeReplicationTestKnobs() - ltk.storeKnobs.ReceiveSnapshot = func(h *kvserver.SnapshotRequest_Header) error { - if atomic.LoadInt64(&rejectSnapshots) > 0 { - return errors.New(`nope`) + runTest := func(t *testing.T, replicaType roachpb.ReplicaType) { + var rejectSnapshots int64 + knobs, ltk := makeReplicationTestKnobs() + ltk.storeKnobs.ReceiveSnapshot = func(h *kvserver.SnapshotRequest_Header) error { + if atomic.LoadInt64(&rejectSnapshots) > 0 { + return errors.New(`nope`) + } + return nil } - return nil + ctx := context.Background() + tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{Knobs: knobs}, + ReplicationMode: base.ReplicationManual, + }) + defer tc.Stopper().Stop(ctx) + + scratchStartKey := tc.ScratchRange(t) + atomic.StoreInt64(&rejectSnapshots, 1) + var err error + switch replicaType { + case roachpb.LEARNER: + _, err = tc.AddVoters(scratchStartKey, tc.Target(1)) + case roachpb.NON_VOTER: + _, err = tc.AddNonVoters(scratchStartKey, tc.Target(1)) + default: + log.Fatalf(ctx, "unexpected replicaType: %s", replicaType) + } + + if !testutils.IsError(err, `remote couldn't accept INITIAL snapshot`) { + t.Fatalf(`expected "remote couldn't accept INITIAL snapshot" error got: %+v`, err) + } + // Make sure we cleaned up after ourselves (by removing the learner/non-voter). + desc := tc.LookupRangeOrFatal(t, scratchStartKey) + require.Empty(t, desc.Replicas().LearnerDescriptors()) } - ctx := context.Background() - tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ - ServerArgs: base.TestServerArgs{Knobs: knobs}, - ReplicationMode: base.ReplicationManual, + + t.Run("learner", func(t *testing.T) { + runTest(t, roachpb.LEARNER) }) - defer tc.Stopper().Stop(ctx) + t.Run("non-voter", func(t *testing.T) { + runTest(t, roachpb.NON_VOTER) + }) +} + +// TestNonVoterCatchesUpViaRaftSnapshotQueue ensures that a non-voting replica +// in need of a snapshot will receive one via the raft snapshot queue. This is +// also meant to test that a non-voting replica that is initialized via an +// `INITIAL` snapshot during its addition is not ignored by the raft snapshot +// queue for future snapshots. +func TestNonVoterCatchesUpViaRaftSnapshotQueue(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + skip.UnderShort(t, "this test sleeps for a few seconds") + + var skipInitialSnapshot int64 + knobs, ltk := makeReplicationTestKnobs() + ctx := context.Background() + + // Set it up such that the newly added non-voter will not receive its INITIAL + // snapshot. + ltk.storeKnobs.ReplicaSkipInitialSnapshot = func() bool { + return atomic.LoadInt64(&skipInitialSnapshot) == 1 + } + // Synchronize with the removal of the "best effort" lock on log truncation. + // See (*Replica).lockLearnerSnapshot for details. + nonVoterSnapLockRemoved := make(chan struct{}, 1) + ltk.storeKnobs.NonVoterAfterInitialization = func() { + nonVoterSnapLockRemoved <- struct{}{} + } + // Disable the raft snapshot queue, we will manually queue a replica into it + // below. 
+ ltk.storeKnobs.DisableRaftSnapshotQueue = true
+ tc := testcluster.StartTestCluster(
+ t, 2, base.TestClusterArgs{
+ ServerArgs: base.TestServerArgs{Knobs: knobs},
+ ReplicationMode: base.ReplicationManual,
+ },
+ )
+ defer tc.Stopper().Stop(ctx)
 
 scratchStartKey := tc.ScratchRange(t)
- atomic.StoreInt64(&rejectSnapshots, 1)
- _, err := tc.AddVoters(scratchStartKey, tc.Target(1))
- // TODO(dan): It'd be nice if we could cancel the `AddVoters` context before
- // returning the error from the `ReceiveSnapshot` knob to test the codepath
- // that uses a new context for the rollback, but plumbing that context is
- // annoying.
- if !testutils.IsError(err, `remote couldn't accept LEARNER_INITIAL snapshot`) {
- t.Fatalf(`expected "remote couldn't accept LEARNER_INITIAL snapshot" error got: %+v`, err)
+ g, ctx := errgroup.WithContext(ctx)
+
+ // Add a new non-voting replica, but don't initialize it. Note that
+ // `tc.AddNonVoters` will not return until the newly added non-voter is
+ // initialized, which we will do below via the snapshot queue.
+ g.Go(func() error {
+ atomic.StoreInt64(&skipInitialSnapshot, 1)
+ _, err := tc.AddNonVoters(scratchStartKey, tc.Target(1))
+ return err
+ })
+
+ select {
+ case <-nonVoterSnapLockRemoved:
+ case <-time.After(testutils.DefaultSucceedsSoonDuration):
+ t.Fatal("took too long")
 }
- // Make sure we cleaned up after ourselves (by removing the learner).
- desc := tc.LookupRangeOrFatal(t, scratchStartKey)
- require.Empty(t, desc.Replicas().LearnerDescriptors())
+ scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey)
+ leaseholderStore := tc.GetFirstStoreFromServer(t, 0)
+ require.NotNil(t, leaseholderStore)
+ leaseholderRepl, err := leaseholderStore.GetReplica(scratchDesc.RangeID)
+ require.NoError(t, err)
+ require.NotNil(t, leaseholderRepl)
+
+ time.Sleep(kvserver.RaftLogQueuePendingSnapshotGracePeriod)
+ // Manually enqueue the leaseholder replica into its store's raft snapshot
+ // queue. We expect it to pick up on the fact that the non-voter on its range
+ // needs a snapshot.
+ recording, pErr, err := leaseholderStore.ManuallyEnqueue( + ctx, "raftsnapshot", leaseholderRepl, false, /* skipShouldQueue */ + ) + require.NoError(t, pErr) + require.NoError(t, err) + trace := recording.String() + require.Regexp(t, "streamed VIA_SNAPSHOT_QUEUE snapshot.*to.*NON_VOTER", trace) + require.NoError(t, g.Wait()) } func TestSplitWithLearnerOrJointConfig(t *testing.T) { @@ -558,7 +628,7 @@ func TestRaftSnapshotQueueSeesLearner(t *testing.T) { if processErr != nil { return processErr } - const msg = `skipping snapshot; replica is likely a learner in the process of being added: (n2,s2):2LEARNER` + const msg = `skipping snapshot; replica is likely a LEARNER in the process of being added: (n2,s2):2LEARNER` formattedTrace := trace.String() if !strings.Contains(formattedTrace, msg) { return errors.Errorf(`expected "%s" in trace got:\n%s`, msg, formattedTrace) diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 32251889af88..276deedb72d0 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -1327,7 +1327,7 @@ func (r *Replica) completeSnapshotLogTruncationConstraint( return } - deadline := now.Add(raftLogQueuePendingSnapshotGracePeriod) + deadline := now.Add(RaftLogQueuePendingSnapshotGracePeriod) item.deadline = deadline r.mu.snapshotLogTruncationConstraints[snapUUID] = item } diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index ed647ae043a1..59378df61af4 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -190,20 +190,29 @@ type StoreTestingKnobs struct { // acquiring snapshot quota or doing shouldAcceptSnapshotData checks. If an // error is returned from the hook, it's sent as an ERROR SnapshotResponse. ReceiveSnapshot func(*SnapshotRequest_Header) error - // ReplicaAddSkipRollback causes replica addition to skip the learner rollback - // that happens when promotion to a voter fails. + // ReplicaAddSkipLearnerRollback causes replica addition to skip the learner + // rollback that happens when either the initial snapshot or the promotion of + // a learner to a voter fails. ReplicaAddSkipLearnerRollback func() bool - // ReplicaAddStopAfterLearnerSnapshot causes replica addition to return early + // VoterAddStopAfterLearnerSnapshot causes voter addition to return early // if the func returns true. Specifically, after the learner txn is successful // and after the LEARNER type snapshot, but before promoting it to a voter. // This ensures the `*Replica` will be materialized on the Store when it // returns. - ReplicaAddStopAfterLearnerSnapshot func([]roachpb.ReplicationTarget) bool - // ReplicaSkipLearnerSnapshot causes snapshots to never be sent to learners - // if the func returns true. Adding replicas proceeds as usual, though if - // the added replica has no prior state which can be caught up from the raft - // log, the result will be an voter that is unable to participate in quorum. - ReplicaSkipLearnerSnapshot func() bool + VoterAddStopAfterLearnerSnapshot func([]roachpb.ReplicationTarget) bool + // NonVoterAfterInitialization is called after a newly added non-voting + // replica receives its initial snapshot. Note that this knob _can_ be used in + // conjunction with ReplicaSkipInitialSnapshot. + NonVoterAfterInitialization func() + // ReplicaSkipInitialSnapshot causes snapshots to never be sent to learners or + // non-voters if the func returns true. 
Adding replicas proceeds as usual, + // though if an added voter has no prior state which can be caught up from the + // raft log, the result will be an voter that is unable to participate in + // quorum. + ReplicaSkipInitialSnapshot func() bool + // RaftSnapshotQueueSkipReplica causes the raft snapshot queue to skip sending + // a snapshot to a follower replica. + RaftSnapshotQueueSkipReplica func() bool // VoterAddStopAfterJointConfig causes voter addition to return early if // the func returns true. This happens before transitioning out of a joint // configuration, after the joint configuration has been entered by means diff --git a/pkg/roachpb/metadata_replicas.go b/pkg/roachpb/metadata_replicas.go index 77e61e9bad9e..5709564fc487 100644 --- a/pkg/roachpb/metadata_replicas.go +++ b/pkg/roachpb/metadata_replicas.go @@ -164,7 +164,7 @@ func (d ReplicaSet) VoterDescriptors() []ReplicaDescriptor { // // At the time of writing, learners are used in CockroachDB as an interim state // while adding a replica. A learner replica is added to the range via raft -// ConfChange, a raft snapshot (of type LEARNER_INITIAL) is sent to catch it up, and +// ConfChange, a raft snapshot (of type INITIAL) is sent to catch it up, and // then a second ConfChange promotes it to a full replica. // // This means that learners are currently always expected to have a short @@ -240,28 +240,12 @@ func (d ReplicaSet) LearnerDescriptors() []ReplicaDescriptor { } // NonVoters returns a ReplicaSet containing only the non-voters in `d`. -// Non-voting replicas are treated differently from learner replicas. -// Learners are a temporary internal state used to make atomic -// replication changes less disruptive to the system. Even though learners and -// non-voting replicas are both etcd/raft LearnerNodes under the hood, -// non-voting replicas are meant to be a user-visible state and are explicitly -// chosen to be placed inside certain localities via zone configs. -// -// Key differences between how we treat (ephemeral) learners and (persistent) -// non-voting replicas: - Non-voting replicas rely on the raft snapshot queue in -// order to upreplicate. This is different from the way learner replicas -// upreplicate (see comment above) because of the various (necessary) -// complexities / race-conditions we've discovered between the raft snapshot -// queue and the separately-issued initial LEARNER_INITIAL snapshot (see the two -// paragraphs above [*]). This complexity was necessary in case of learner -// replicas because we _need to know_ when they finish upreplication so that we -// can initiate their promotion into full voters. We don't have a similar -// requirement for non-voting replicas and we're choosing to avoid all the -// complexity. -// -// TODO(aayush): Expand this documentation once `AdminRelocateRange` knows how -// to deal with such replicas & range merges no longer block due to the presence -// of non-voting replicas. +// Non-voting replicas are treated differently from learner replicas. Learners +// are a temporary internal state used to make atomic replication changes less +// disruptive to the system. Even though learners and non-voting replicas are +// both etcd/raft LearnerNodes under the hood, non-voting replicas are meant to +// be a user-visible state and are explicitly chosen to be placed inside certain +// localities via zone configs. 
func (d ReplicaSet) NonVoters() ReplicaSet { return d.Filter(predNonVoter) } diff --git a/pkg/testutils/serverutils/test_cluster_shim.go b/pkg/testutils/serverutils/test_cluster_shim.go index c8bc71157c69..05c767286007 100644 --- a/pkg/testutils/serverutils/test_cluster_shim.go +++ b/pkg/testutils/serverutils/test_cluster_shim.go @@ -85,7 +85,8 @@ type TestClusterInterface interface { // //This method blocks until the new replicas become a part of the Raft group. AddNonVoters( - startKey roachpb.Key, targets ...roachpb.ReplicationTarget, + startKey roachpb.Key, + targets ...roachpb.ReplicationTarget, ) (roachpb.RangeDescriptor, error) // AddNonVotersOrFatal is the same as AddNonVoters but will fatal if it fails.
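A short usage sketch of the shim above (hypothetical code, not part of the patch): since `AddNonVoters` blocks until the new replica has joined the Raft group, the returned descriptor can be inspected immediately with the `ReplicaSet` helpers touched in this change; `startKey` is a placeholder.

    // Hypothetical usage of TestClusterInterface.AddNonVoters.
    desc, err := tc.AddNonVoters(startKey, tc.Target(1))
    require.NoError(t, err)
    // Learners (the interim state used while adding voters) and non-voters
    // (persistent, user-requested replicas) are surfaced separately.
    require.Len(t, desc.Replicas().LearnerDescriptors(), 0)
    require.Len(t, desc.Replicas().NonVoters().Descriptors(), 1)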