-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
replica_command.go
3365 lines (3155 loc) · 134 KB
/
replica_command.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package kvserver
import (
"bytes"
"context"
"fmt"
"math/rand"
"sort"
"time"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/rpc"
"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"go.etcd.io/etcd/raft/v3"
"go.etcd.io/etcd/raft/v3/raftpb"
"go.etcd.io/etcd/raft/v3/tracker"
)
// mergeApplicationTimeout is the timeout when waiting for a merge command to be
// applied on all range replicas. There doesn't appear to be any strong reason
// why this value was chosen in particular, but it seems to work.
const mergeApplicationTimeout = 5 * time.Second
// sendSnapshotTimeout is the timeout for sending snapshots. While a snapshot is
// in transit, Raft log truncation is halted to allow the recipient to catch up.
// If the snapshot takes very long to transfer for whatever reason this can
// cause the Raft log to grow very large. We therefore set a conservative
// timeout to eventually allow Raft log truncation while avoiding snapshot
// starvation -- even if another snapshot is sent immediately, this still
// allows truncation up to the new snapshot index.
var sendSnapshotTimeout = envutil.EnvOrDefaultDuration(
"COCKROACH_RAFT_SEND_SNAPSHOT_TIMEOUT", 1*time.Hour)
// AdminSplit divides the range into into two ranges using args.SplitKey.
func (r *Replica) AdminSplit(
ctx context.Context, args roachpb.AdminSplitRequest, reason string,
) (reply roachpb.AdminSplitResponse, _ *roachpb.Error) {
if len(args.SplitKey) == 0 {
return roachpb.AdminSplitResponse{}, roachpb.NewErrorf("cannot split range with no key provided")
}
err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
var err error
reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason)
return err
})
return reply, err
}
func maybeDescriptorChangedError(
desc *roachpb.RangeDescriptor, err error,
) (ok bool, expectedDesc *roachpb.RangeDescriptor) {
if detail := (*roachpb.ConditionFailedError)(nil); errors.As(err, &detail) {
// Provide a better message in the common case that the range being changed
// was already changed by a concurrent transaction.
var actualDesc roachpb.RangeDescriptor
if !detail.ActualValue.IsPresent() {
return true, nil
} else if err := detail.ActualValue.GetProto(&actualDesc); err == nil && !desc.Equal(&actualDesc) {
return true, &actualDesc
}
}
return false, nil
}
func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string {
var s string
if status != nil && status.RaftState == raft.StateLeader {
for replicaID, pr := range status.Progress {
if replicaID == status.Lead {
// TODO(tschottdorf): remove this line once we have picked up
// https://github.com/etcd-io/etcd/pull/10279
continue
}
if pr.State == tracker.StateReplicate {
// This follower is in good working order.
continue
}
s += fmt.Sprintf("; r%d/%d is ", rangeID, replicaID)
switch pr.State {
case tracker.StateSnapshot:
// If the Raft snapshot queue is backed up, replicas can spend
// minutes or worse until they are caught up.
s += "waiting for a Raft snapshot"
case tracker.StateProbe:
// Assuming the split has already been delayed for a little bit,
// seeing a follower that is probing hints at some problem with
// Raft or Raft message delivery. (Of course it's possible that
// the follower *just* entered probing state).
s += "being probed (may or may not need a Raft snapshot)"
default:
// Future proofing.
s += "in unknown state " + pr.State.String()
}
}
}
return s
}
// prepareSplitDescs returns the left and right descriptor of the split whose
// right side is assigned rightRangeID and starts at splitKey. The supplied
// expiration is the "sticky bit" stored on the right descriptor.
func prepareSplitDescs(
ctx context.Context,
st *cluster.Settings,
rightRangeID roachpb.RangeID,
splitKey roachpb.RKey,
expiration hlc.Timestamp,
leftDesc *roachpb.RangeDescriptor,
) (*roachpb.RangeDescriptor, *roachpb.RangeDescriptor) {
// Create right hand side range descriptor.
rightDesc := roachpb.NewRangeDescriptor(rightRangeID, splitKey, leftDesc.EndKey, leftDesc.Replicas())
// Init updated version of existing range descriptor.
{
tmp := *leftDesc
leftDesc = &tmp
}
leftDesc.IncrementGeneration()
leftDesc.EndKey = splitKey
// Set the generation of the right hand side descriptor to match that of the
// (updated) left hand side. See the comment on the field for an explanation
// of why generations are useful.
rightDesc.Generation = leftDesc.Generation
setStickyBit(rightDesc, expiration)
return leftDesc, rightDesc
}
func setStickyBit(desc *roachpb.RangeDescriptor, expiration hlc.Timestamp) {
// TODO(jeffreyxiao): Remove this check in 20.1.
// Note that the client API for splitting has expiration time as
// non-nullable, but the internal representation of a sticky bit is nullable
// for backwards compatibility. If expiration time is the zero timestamp, we
// must be sure not to set the sticky bit to the zero timestamp because the
// byte representation of setting the stickyBit to nil is different than
// setting it to hlc.Timestamp{}. This check ensures that CPuts would not
// fail on older versions.
if !expiration.IsEmpty() {
desc.StickyBit = &expiration
}
}
func splitTxnAttempt(
ctx context.Context,
store *Store,
txn *kv.Txn,
rightRangeID roachpb.RangeID,
splitKey roachpb.RKey,
expiration hlc.Timestamp,
oldDesc *roachpb.RangeDescriptor,
) error {
txn.SetDebugName(splitTxnName)
_, dbDescValue, err := conditionalGetDescValueFromDB(
ctx, txn, oldDesc.StartKey, false /* forUpdate */, checkDescsEqual(oldDesc))
if err != nil {
return err
}
// TODO(tbg): return desc from conditionalGetDescValueFromDB and don't pass
// in oldDesc any more (just the start key).
desc := oldDesc
oldDesc = nil // prevent accidental use
leftDesc, rightDesc := prepareSplitDescs(
ctx, store.ClusterSettings(), rightRangeID, splitKey, expiration, desc)
// Update existing range descriptor for left hand side of
// split. Note that we mutate the descriptor for the left hand
// side of the split first to locate the txn record there.
{
b := txn.NewBatch()
leftDescKey := keys.RangeDescriptorKey(leftDesc.StartKey)
if err := updateRangeDescriptor(ctx, b, leftDescKey, dbDescValue, leftDesc); err != nil {
return err
}
// Commit this batch first to ensure that the transaction record
// is created in the right place (split trigger relies on this).
// Sending the batch containing only the first write guarantees
// the transaction record is written first, preventing cases
// where splits are aborted early due to conflicts with meta
// intents (see #9265).
log.Event(ctx, "updating LHS descriptor")
if err := txn.Run(ctx, b); err != nil {
return err
}
}
// Log the split into the range event log.
if err := store.logSplit(ctx, txn, *leftDesc, *rightDesc); err != nil {
return err
}
b := txn.NewBatch()
// Write range descriptor for right hand side of the split.
rightDescKey := keys.RangeDescriptorKey(rightDesc.StartKey)
if err := updateRangeDescriptor(ctx, b, rightDescKey, nil, rightDesc); err != nil {
return err
}
// Update range descriptor addressing record(s).
if err := splitRangeAddressing(b, rightDesc, leftDesc); err != nil {
return err
}
// End the transaction manually, instead of letting RunTransaction
// loop do it, in order to provide a split trigger.
b.AddRawRequest(&roachpb.EndTxnRequest{
Commit: true,
InternalCommitTrigger: &roachpb.InternalCommitTrigger{
SplitTrigger: &roachpb.SplitTrigger{
LeftDesc: *leftDesc,
RightDesc: *rightDesc,
},
},
})
// Commit txn with final batch (RHS descriptor and meta).
log.Event(ctx, "commit txn with batch containing RHS descriptor and meta records")
return txn.Run(ctx, b)
}
func splitTxnStickyUpdateAttempt(
ctx context.Context, txn *kv.Txn, desc *roachpb.RangeDescriptor, expiration hlc.Timestamp,
) error {
_, dbDescValue, err := conditionalGetDescValueFromDB(
ctx, txn, desc.StartKey, false /* forUpdate */, checkDescsEqual(desc))
if err != nil {
return err
}
newDesc := *desc
setStickyBit(&newDesc, expiration)
b := txn.NewBatch()
descKey := keys.RangeDescriptorKey(desc.StartKey)
if err := updateRangeDescriptor(ctx, b, descKey, dbDescValue, &newDesc); err != nil {
return err
}
if err := updateRangeAddressing(b, &newDesc); err != nil {
return err
}
// End the transaction manually, instead of letting RunTransaction loop
// do it, in order to provide a sticky bit trigger.
b.AddRawRequest(&roachpb.EndTxnRequest{
Commit: true,
InternalCommitTrigger: &roachpb.InternalCommitTrigger{
StickyBitTrigger: &roachpb.StickyBitTrigger{
StickyBit: newDesc.GetStickyBit(),
},
},
})
return txn.Run(ctx, b)
}
// adminSplitWithDescriptor divides the range into into two ranges, using
// either args.SplitKey (if provided) or an internally computed key that aims
// to roughly equipartition the range by size. The split is done inside of a
// distributed txn which writes updated left and new right hand side range
// descriptors, and updates the range addressing metadata. The handover of
// responsibility for the reassigned key range is carried out seamlessly
// through a split trigger carried out as part of the commit of that
// transaction.
//
// The supplied RangeDescriptor is used as a form of optimistic lock. An
// operation which might split a range should obtain a copy of the range's
// current descriptor before making the decision to split. If the decision is
// affirmative the descriptor is passed to AdminSplit, which performs a
// Conditional Put on the RangeDescriptor to ensure that no other operation has
// modified the range in the time the decision was being made.
// TODO(tschottdorf): should assert that split key is not a local key.
//
// See the comment on splitTrigger for details on the complexities.
func (r *Replica) adminSplitWithDescriptor(
ctx context.Context,
args roachpb.AdminSplitRequest,
desc *roachpb.RangeDescriptor,
delayable bool,
reason string,
) (roachpb.AdminSplitResponse, error) {
var err error
var reply roachpb.AdminSplitResponse
// The split queue doesn't care about the set of replicas, so if we somehow
// are being handed one that's in a joint state, finalize that before
// continuing.
desc, err = r.maybeLeaveAtomicChangeReplicas(ctx, desc)
if err != nil {
return reply, err
}
// Determine split key if not provided with args. This scan is
// allowed to be relatively slow because admin commands don't block
// other commands.
log.Event(ctx, "split begins")
var splitKey roachpb.RKey
{
var foundSplitKey roachpb.Key
if len(args.SplitKey) == 0 {
// Find a key to split by size.
var err error
targetSize := r.GetMaxBytes() / 2
foundSplitKey, err = storage.MVCCFindSplitKey(
ctx, r.store.engine, desc.StartKey, desc.EndKey, targetSize)
if err != nil {
return reply, errors.Wrap(err, "unable to determine split key")
}
if foundSplitKey == nil {
// No suitable split key could be found.
return reply, unsplittableRangeError{}
}
} else {
// If the key that routed this request to this range is now out of this
// range's bounds, return an error for the client to try again on the
// correct range.
if !kvserverbase.ContainsKey(desc, args.Key) {
ri := r.GetRangeInfo(ctx)
return reply, roachpb.NewRangeKeyMismatchErrorWithCTPolicy(ctx, args.Key, args.Key, desc, &ri.Lease, ri.ClosedTimestampPolicy)
}
foundSplitKey = args.SplitKey
}
if !kvserverbase.ContainsKey(desc, foundSplitKey) {
return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r)
}
// If predicate keys are specified, make sure they are contained by this
// range as well.
for _, k := range args.PredicateKeys {
if !kvserverbase.ContainsKey(desc, k) {
return reply, errors.Errorf("requested predicate key %s out of bounds of %s", k, r)
}
}
var err error
splitKey, err = keys.Addr(foundSplitKey)
if err != nil {
return reply, err
}
if !splitKey.Equal(foundSplitKey) {
return reply, errors.Errorf("cannot split range at range-local key %s", splitKey)
}
if !storage.IsValidSplitKey(foundSplitKey) {
return reply, errors.Errorf("cannot split range at key %s", splitKey)
}
}
// If the range starts at the splitKey, we treat the AdminSplit
// as a no-op and return success instead of throwing an error.
if desc.StartKey.Equal(splitKey) {
if len(args.SplitKey) == 0 {
log.Fatal(ctx, "MVCCFindSplitKey returned start key of range")
}
log.Event(ctx, "range already split")
// Even if the range is already split, we should still update the sticky
// bit if it has a later expiration time.
if desc.GetStickyBit().Less(args.ExpirationTime) {
err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
return splitTxnStickyUpdateAttempt(ctx, txn, desc, args.ExpirationTime)
})
// The ConditionFailedError can occur because the descriptors acting as
// expected values in the CPuts used to update the range descriptor are
// picked outside the transaction. Return ConditionFailedError in the
// error detail so that the command can be retried.
if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
// NB: we have to wrap the existing error here as consumers of this code
// look at the root cause to sniff out the changed descriptor.
err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
}
return reply, err
}
return reply, nil
}
log.Event(ctx, "found split key")
// Create right hand side range descriptor.
rightRangeID, err := r.store.AllocateRangeID(ctx)
if err != nil {
return reply, errors.Wrap(err, "unable to allocate range id for right hand side")
}
var extra string
if delayable {
extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
}
extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
log.Infof(ctx, "initiating a split of this range at key %s [r%d] (%s)%s",
splitKey.StringWithDirs(nil /* valDirs */, 50 /* maxLen */), rightRangeID, reason, extra)
if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
return splitTxnAttempt(ctx, r.store, txn, rightRangeID, splitKey, args.ExpirationTime, desc)
}); err != nil {
// The ConditionFailedError can occur because the descriptors acting
// as expected values in the CPuts used to update the left or right
// range descriptors are picked outside the transaction. Return
// ConditionFailedError in the error detail so that the command can be
// retried.
if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
// NB: we have to wrap the existing error here as consumers of this code
// look at the root cause to sniff out the changed descriptor.
err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
}
return reply, errors.Wrapf(err, "split at key %s failed", splitKey)
}
return reply, nil
}
// AdminUnsplit removes the sticky bit of the range specified by the
// args.Key.
func (r *Replica) AdminUnsplit(
ctx context.Context, args roachpb.AdminUnsplitRequest, reason string,
) (roachpb.AdminUnsplitResponse, *roachpb.Error) {
var reply roachpb.AdminUnsplitResponse
err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
var err error
reply, err = r.adminUnsplitWithDescriptor(ctx, args, desc, reason)
return err
})
return reply, err
}
func (r *Replica) adminUnsplitWithDescriptor(
ctx context.Context,
args roachpb.AdminUnsplitRequest,
desc *roachpb.RangeDescriptor,
reason string,
) (roachpb.AdminUnsplitResponse, error) {
var reply roachpb.AdminUnsplitResponse
if !bytes.Equal(desc.StartKey.AsRawKey(), args.Header().Key) {
return reply, errors.Errorf("key %s is not the start of a range", args.Header().Key)
}
// If the range's sticky bit is already hlc.Timestamp{}, we treat the unsplit
// command as a no-op and return success instead of throwing an error. On
// mixed version clusters that don't support StickyBit, all range descriptor
// sticky bits are guaranteed to be nil, so we can skip checking the cluster
// version.
if desc.GetStickyBit().IsEmpty() {
return reply, nil
}
if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
_, dbDescValue, err := conditionalGetDescValueFromDB(
ctx, txn, desc.StartKey, false /* forUpdate */, checkDescsEqual(desc))
if err != nil {
return err
}
newDesc := *desc
// Use nil instead of &zero until 20.1; this field is new in 19.2. We
// could use &zero here because the sticky bit will never be populated
// before the cluster version reaches 19.2 and the early return above
// already handles that case, but nothing is won in doing so.
newDesc.StickyBit = nil
descKey := keys.RangeDescriptorKey(newDesc.StartKey)
b := txn.NewBatch()
if err := updateRangeDescriptor(ctx, b, descKey, dbDescValue, &newDesc); err != nil {
return err
}
if err := updateRangeAddressing(b, &newDesc); err != nil {
return err
}
// End the transaction manually in order to provide a sticky bit trigger.
b.AddRawRequest(&roachpb.EndTxnRequest{
Commit: true,
InternalCommitTrigger: &roachpb.InternalCommitTrigger{
StickyBitTrigger: &roachpb.StickyBitTrigger{
// Setting StickyBit to the zero timestamp ensures that it is always
// eligible for automatic merging.
StickyBit: hlc.Timestamp{},
},
},
})
return txn.Run(ctx, b)
}); err != nil {
// The ConditionFailedError can occur because the descriptors acting as
// expected values in the CPuts used to update the range descriptor are
// picked outside the transaction. Return ConditionFailedError in the error
// detail so that the command can be retried.
if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
// NB: we have to wrap the existing error here as consumers of this code
// look at the root cause to sniff out the changed descriptor.
err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
}
return reply, err
}
return reply, nil
}
// executeAdminCommandWithDescriptor wraps a read-modify-write operation for RangeDescriptors in a
// retry loop.
func (r *Replica) executeAdminCommandWithDescriptor(
ctx context.Context, updateDesc func(*roachpb.RangeDescriptor) error,
) *roachpb.Error {
// Retry forever as long as we see errors we know will resolve.
retryOpts := base.DefaultRetryOptions()
// Randomize quite a lot just in case someone else also interferes with us
// in a retry loop. Note that this is speculative; there wasn't an incident
// that suggested this.
retryOpts.RandomizationFactor = 0.5
var lastErr error
for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); {
// The replica may have been destroyed since the start of the retry loop.
// We need to explicitly check this condition. Having a valid lease, as we
// verify below, does not imply that the range still exists: even after a
// range has been merged into its left-hand neighbor, its final lease
// (i.e., the lease we have in r.mu.state.Lease) can remain valid
// indefinitely.
if _, err := r.IsDestroyed(); err != nil {
return roachpb.NewError(err)
}
// Admin commands always require the range lease to begin (see
// executeAdminBatch), but we may have lost it while in this retry loop.
// Without the lease, a replica's local descriptor can be arbitrarily
// stale, which will result in a ConditionFailedError. To avoid this, we
// make sure that we still have the lease before each attempt.
if _, pErr := r.redirectOnOrAcquireLease(ctx); pErr != nil {
return pErr
}
lastErr = updateDesc(r.Desc())
// On seeing a ConditionFailedError or an AmbiguousResultError, retry the
// command with the updated descriptor.
if !errors.HasType(lastErr, (*roachpb.ConditionFailedError)(nil)) &&
!errors.HasType(lastErr, (*roachpb.AmbiguousResultError)(nil)) {
break
}
}
return roachpb.NewError(lastErr)
}
// AdminMerge extends this range to subsume the range that comes next
// in the key space. The merge is performed inside of a distributed
// transaction which writes the left hand side range descriptor (the
// subsuming range) and deletes the range descriptor for the right
// hand side range (the subsumed range). It also updates the range
// addressing metadata. The handover of responsibility for the
// reassigned key range is carried out seamlessly through a merge
// trigger carried out as part of the commit of that transaction. A
// merge requires that the two ranges are collocated on the same set
// of replicas.
//
// The supplied RangeDescriptor is used as a form of optimistic lock. See the
// comment of "AdminSplit" for more information on this pattern.
func (r *Replica) AdminMerge(
ctx context.Context, args roachpb.AdminMergeRequest, reason string,
) (roachpb.AdminMergeResponse, *roachpb.Error) {
var reply roachpb.AdminMergeResponse
runMergeTxn := func(txn *kv.Txn) error {
log.Event(ctx, "merge txn begins")
txn.SetDebugName(mergeTxnName)
// Pipelining might send QueryIntent requests to the RHS after the RHS has
// noticed the merge and started blocking all traffic. This causes the merge
// transaction to deadlock. Just turn pipelining off; the structure of the
// merge transaction means pipelining provides no performance benefit
// anyway.
if err := txn.DisablePipelining(); err != nil {
return err
}
origLeftDesc := r.Desc()
if origLeftDesc.EndKey.Equal(roachpb.RKeyMax) {
// Merging the final range doesn't make sense.
return errors.New("cannot merge final range")
}
// Retrieve the current left hand side's range descriptor and confirm
// that it matches our expectation. Do so using a locking read. Locking
// the descriptor early (i.e. on the read instead of the write of the
// read-modify-write operation) helps prevent multiple concurrent Range
// merges from thrashing. Thrashing is especially detrimental for Range
// merges because any restart results in an abort (see the retry loop
// below), so thrashing can result in indefinite livelock.
//
// Because this is a locking read, this also dictates the location of
// the merge's transaction record. It is critical to the range merge
// protocol that the transaction record be placed on the the left hand
// side's descriptor, as the MergeTrigger depends on this.
_, dbOrigLeftDescValue, err := conditionalGetDescValueFromDB(
ctx, txn, origLeftDesc.StartKey, true /* forUpdate */, checkDescsEqual(origLeftDesc))
if err != nil {
return err
}
// Do a consistent read of the right hand side's range descriptor.
// Again, use a locking read because we intend to update this key
// shortly.
var rightDesc roachpb.RangeDescriptor
rightDescKey := keys.RangeDescriptorKey(origLeftDesc.EndKey)
dbRightDescKV, err := txn.GetForUpdate(ctx, rightDescKey)
if err != nil {
return err
}
if err := dbRightDescKV.ValueProto(&rightDesc); err != nil {
return err
}
// Verify that the two ranges are mergeable.
if !bytes.Equal(origLeftDesc.EndKey, rightDesc.StartKey) {
// Should never happen, but just in case.
return errors.Errorf("ranges are not adjacent; %s != %s", origLeftDesc.EndKey, rightDesc.StartKey)
}
// For simplicity, don't handle learner replicas or joint states, expect
// the caller to resolve them first. (Defensively, we check that there
// are no non-voter replicas, in case some third type is later added).
// This behavior can be changed later if the complexity becomes worth
// it, but it's not right now.
//
// NB: the merge queue transitions out of any joint states and removes
// any learners it sees. It's sort of silly that we don't do that here
// instead; effectively any caller of AdminMerge that is not the merge
// queue won't be able to recover from these cases (though the replicate
// queues should fix things up quickly).
lReplicas, rReplicas := origLeftDesc.Replicas(), rightDesc.Replicas()
if len(lReplicas.VoterFullAndNonVoterDescriptors()) != len(lReplicas.Descriptors()) {
return errors.Errorf("cannot merge ranges when lhs is in a joint state or has learners: %s",
lReplicas)
}
if len(rReplicas.VoterFullAndNonVoterDescriptors()) != len(rReplicas.Descriptors()) {
return errors.Errorf("cannot merge ranges when rhs is in a joint state or has learners: %s",
rReplicas)
}
if !replicasCollocated(lReplicas.Descriptors(), rReplicas.Descriptors()) {
return errors.Errorf("ranges not collocated; %s != %s", lReplicas, rReplicas)
}
// Ensure that every current replica of the LHS has been initialized.
// Otherwise there is a rare race where the replica GC queue can GC a
// replica of the RHS too early. The comment on
// TestStoreRangeMergeUninitializedLHSFollower explains the situation in full.
if err := waitForReplicasInit(
ctx, r.store.cfg.NodeDialer, origLeftDesc.RangeID, origLeftDesc.Replicas().Descriptors(),
); err != nil {
return errors.Wrap(err, "waiting for all left-hand replicas to initialize")
}
// Out of an abundance of caution, also ensure that replicas of the RHS have
// all been initialized. If for whatever reason the initial upreplication
// snapshot for a NON_VOTER on the RHS fails, it will have to get picked up
// by the raft snapshot queue to upreplicate and may be uninitialized at
// this point. As such, if we send a subsume request to the RHS in this sort
// of state, we will wastefully and unintentionally block all traffic on it
// for 5 seconds.
if err := waitForReplicasInit(
ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, rightDesc.Replicas().Descriptors(),
); err != nil {
return errors.Wrap(err, "waiting for all right-hand replicas to initialize")
}
mergeReplicas := lReplicas.Descriptors()
updatedLeftDesc := *origLeftDesc
// lhs.Generation = max(rhs.Generation, lhs.Generation)+1.
// See the comment on the Generation field for why generation are useful.
if updatedLeftDesc.Generation < rightDesc.Generation {
updatedLeftDesc.Generation = rightDesc.Generation
}
updatedLeftDesc.IncrementGeneration()
updatedLeftDesc.EndKey = rightDesc.EndKey
log.Infof(ctx, "initiating a merge of %s into this range (%s)", &rightDesc, reason)
// Log the merge into the range event log.
// TODO(spencer): event logging API should accept a batch
// instead of a transaction; there's no reason this logging
// shouldn't be done in parallel via the batch with the updated
// range addressing.
if err := r.store.logMerge(ctx, txn, updatedLeftDesc, rightDesc); err != nil {
return err
}
b := txn.NewBatch()
// Update the meta addressing records.
if err := mergeRangeAddressing(b, origLeftDesc, &updatedLeftDesc); err != nil {
return err
}
// Update the range descriptor for the receiving range.
leftDescKey := keys.RangeDescriptorKey(updatedLeftDesc.StartKey)
if err := updateRangeDescriptor(ctx, b, leftDescKey,
dbOrigLeftDescValue, /* oldValue */
&updatedLeftDesc, /* newDesc */
); err != nil {
return err
}
// Remove the range descriptor for the deleted range.
if err := updateRangeDescriptor(ctx, b, rightDescKey,
dbRightDescKV.Value.TagAndDataBytes(), /* oldValue */
nil, /* newDesc */
); err != nil {
return err
}
// Send off this batch, ensuring that intents are placed on both the local
// copy and meta2's copy of the right-hand side range descriptor before we
// send the Subsume request below. This is the precondition for sending a
// Subsume request; see the godoc on batcheval.Subsume for details.
if err := txn.Run(ctx, b); err != nil {
return err
}
// Intents have been placed, so the merge is now in its critical phase. Get
// a consistent view of the data from the right-hand range. If the merge
// commits, we'll write this data to the left-hand range in the merge
// trigger.
br, pErr := kv.SendWrapped(ctx, r.store.DB().NonTransactionalSender(),
&roachpb.SubsumeRequest{
RequestHeader: roachpb.RequestHeader{Key: rightDesc.StartKey.AsRawKey()},
LeftDesc: *origLeftDesc,
RightDesc: rightDesc,
})
if pErr != nil {
return pErr.GoError()
}
rhsSnapshotRes := br.(*roachpb.SubsumeResponse)
err = contextutil.RunWithTimeout(ctx, "waiting for merge application", mergeApplicationTimeout,
func(ctx context.Context) error {
return waitForApplication(ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, mergeReplicas,
rhsSnapshotRes.LeaseAppliedIndex)
})
if err != nil {
return errors.Wrap(err, "waiting for all right-hand replicas to catch up")
}
// Successful subsume, so we're guaranteed that the right-hand range will
// not serve another request unless this transaction aborts. End the
// transaction manually in order to provide a merge trigger.
b = txn.NewBatch()
b.AddRawRequest(&roachpb.EndTxnRequest{
Commit: true,
InternalCommitTrigger: &roachpb.InternalCommitTrigger{
MergeTrigger: &roachpb.MergeTrigger{
LeftDesc: updatedLeftDesc,
RightDesc: rightDesc,
RightMVCCStats: rhsSnapshotRes.MVCCStats,
FreezeStart: rhsSnapshotRes.FreezeStart,
RightClosedTimestamp: rhsSnapshotRes.ClosedTimestamp,
RightReadSummary: rhsSnapshotRes.ReadSummary,
},
},
})
log.Event(ctx, "attempting commit")
return txn.Run(ctx, b)
}
// If the merge transaction encounters an error, we need to trigger a full
// abort and try again with a new transaction. Why? runMergeTxn has the side
// effect of sending a Subsume request to the right-hand range, which blocks
// the right-hand range from serving any traffic until the transaction commits
// or aborts. If we retry using the same transaction (i.e., a "transaction
// restart"), we'll send requests to the blocked right-hand range and
// deadlock. The right-hand range will see that the transaction is still
// pending and refuse to respond, but the transaction cannot commit until the
// right-hand range responds. By instead marking the transaction as aborted,
// we'll unlock the right-hand range, giving the next, fresh transaction a
// chance to succeed.
//
// A second reason to eschew kv.DB.Txn() is that the API to disable pipelining
// is finicky and only allows disabling pipelining before any operations have
// been sent, even in prior epochs. Calling DisablePipelining() on a restarted
// transaction yields an error.
for {
txn := kv.NewTxn(ctx, r.store.DB(), r.NodeID())
err := runMergeTxn(txn)
if err != nil {
log.VEventf(ctx, 2, "merge txn failed: %s", err)
txn.CleanupOnError(ctx, err)
}
if !errors.HasType(err, (*roachpb.TransactionRetryWithProtoRefreshError)(nil)) {
if err != nil {
return reply, roachpb.NewErrorf("merge failed: %s", err)
}
return reply, nil
}
}
}
func waitForApplication(
ctx context.Context,
dialer *nodedialer.Dialer,
rangeID roachpb.RangeID,
replicas []roachpb.ReplicaDescriptor,
leaseIndex uint64,
) error {
if dialer == nil && len(replicas) == 1 {
// This early return supports unit tests (testContext{}) that also
// want to perform merges.
return nil
}
g := ctxgroup.WithContext(ctx)
for _, repl := range replicas {
repl := repl // copy for goroutine
g.GoCtx(func(ctx context.Context) error {
conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
if err != nil {
return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
}
_, err = NewPerReplicaClient(conn).WaitForApplication(ctx, &WaitForApplicationRequest{
StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
RangeID: rangeID,
LeaseIndex: leaseIndex,
})
return err
})
}
return g.Wait()
}
// waitForReplicasInit blocks until it has proof that the replicas listed in
// desc are initialized on their respective stores. It may return a false
// negative, i.e., claim that a replica is uninitialized when it is, in fact,
// initialized, but it will never return a false positive.
func waitForReplicasInit(
ctx context.Context,
dialer *nodedialer.Dialer,
rangeID roachpb.RangeID,
replicas []roachpb.ReplicaDescriptor,
) error {
if dialer == nil && len(replicas) == 1 {
// This early return supports unit tests (testContext{}) that also
// want to perform merges.
return nil
}
return contextutil.RunWithTimeout(ctx, "wait for replicas init", 5*time.Second, func(ctx context.Context) error {
g := ctxgroup.WithContext(ctx)
for _, repl := range replicas {
repl := repl // copy for goroutine
g.GoCtx(func(ctx context.Context) error {
conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
if err != nil {
return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
}
_, err = NewPerReplicaClient(conn).WaitForReplicaInit(ctx, &WaitForReplicaInitRequest{
StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
RangeID: rangeID,
})
return err
})
}
return g.Wait()
})
}
// ChangeReplicas atomically changes the replicas that are members of a range.
// The change is performed in a distributed transaction and takes effect when
// that transaction is committed. This transaction confirms that the supplied
// RangeDescriptor is up to date and that the supplied slice of
// ReplicationChanges is a valid transition, meaning that replicas being added
// are not present, that replicas being removed are present, that no replica is
// altered more than once, and that no attempt is made at removing the
// leaseholder (which in particular implies that we can never remove all
// replicas).
//
// The returned RangeDescriptor is the new value of the range's descriptor
// following the successful commit of the transaction.
//
// In general, ChangeReplicas will carry out the following steps.
//
// 1. Run a distributed transaction that adds all new replicas as learner replicas.
// Learner replicas receive the log, but do not have voting rights. They are
// used to catch up these new replicas before turning them into voters, which
// is important for the continued availability of the range throughout the
// replication change. Learners are added (and removed) one by one due to a
// technicality (see https://github.com/cockroachdb/cockroach/pull/40268).
//
// The distributed transaction updates both copies of the range descriptor
// (the one on the range and that in the meta ranges) to that effect, and
// commits with a special trigger instructing Raft (via ProposeConfChange) to
// tie a corresponding replication configuration change which goes into
// effect (on each replica) when the transaction commit is applied to the
// state. Applying the command also updates each replica's local view of
// the state to reflect the new descriptor.
//
// If no replicas are being added, this first step is elided. If non-voting
// replicas (which are also learners in etcd/raft) are being added, then this
// step is all we need. The rest of the steps only apply if voter replicas
// are being added.
//
// 2. Send Raft snapshots to all learner replicas. This would happen
// automatically by the existing recovery mechanisms (raft snapshot queue), but
// it is done explicitly as a convenient way to ensure learners are caught up
// before the next step is entered. (We ensure that work is not duplicated
// between the snapshot queue and the explicit snapshot via the
// snapshotLogTruncationConstraints map). Snapshots are subject to both
// bandwidth rate limiting and throttling.
//
// If no replicas are being added, this step is similarly elided.
//
// 3. Carry out a distributed transaction similar to that which added the
// learner replicas, except this time it (atomically) changes all learners to
// voters and removes any replicas for which this was requested; voters are
// demoted before actually being removed to avoid bug in etcd/raft:
// See https://github.com/cockroachdb/cockroach/pull/40268.
//
// If only one replica is being added, raft can chose the simple
// configuration change protocol; otherwise it has to use joint consensus. In
// this latter mechanism, a first configuration change is made which results
// in a configuration ("joint configuration") in which a quorum of both the
// old replicas and the new replica sets is required for decision making.
// Transitioning into this joint configuration, the RangeDescriptor (which is
// the source of truth of the replication configuration) is updated with
// corresponding replicas of type VOTER_INCOMING and VOTER_DEMOTING.
// Immediately after committing this change, a second transition updates the
// descriptor with and activates the final configuration.
//
// Concretely, if the initial members of the range are s1/1, s2/2, and s3/3, and
// an atomic membership change were to add s4/4 and s5/5 while removing s1/1 and
// s2/2, the following range descriptors would form the overall transition:
//
// 1. s1/1 s2/2 s3/3 (VOTER_FULL is implied)
// 2. s1/1 s2/2 s3/3 s4/4LEARNER
// 3. s1/1 s2/2 s3/3 s4/4LEARNER s5/5LEARNER
// 4. s1/1VOTER_DEMOTING_LEARNER s2/2VOTER_DEMOTING_LEARNER s3/3 s4/4VOTER_INCOMING s5/5VOTER_INCOMING
// 5. s1/1LEARNER s2/2LEARNER s3/3 s4/4 s5/5
// 6. s2/2LEARNER s3/3 s4/4 s5/5
// 7. s3/3 s4/4 s5/5
//
// A replica that learns that it was removed will queue itself for replicaGC.
// Note that a removed replica may never apply the configuration change removing
// itself and thus this trigger may not fire. This is because said replica may
// not have been a part of the quorum that committed the configuration change;
// nodes that apply the change will stop sending messages to the removed
// replica. At that point, the removed replica will typically campaign (since it
// receives no more heartbeats from the leader) and its former peers respond via
// a RaftGroupDeletedError (from the Raft transport) as a signal to queue to
// replicaGC. This second mechanism fails if all peers have rapidly moved
// elsewhere as well; in that last and rare case, replica GC queue will
// eventually discover the replica on its own; it has optimizations that handle
// "abandoned-looking" replicas more eagerly than healthy ones.
func (r *Replica) ChangeReplicas(
ctx context.Context,
desc *roachpb.RangeDescriptor,
priority kvserverpb.SnapshotRequest_Priority,
reason kvserverpb.RangeLogEventReason,
details string,
chgs roachpb.ReplicationChanges,
) (updatedDesc *roachpb.RangeDescriptor, _ error) {
if desc == nil {
// TODO(tbg): is this check just FUD?
return nil, errors.Errorf("%s: the current RangeDescriptor must not be nil", r)
}
// If in testing (for lack of a better mechanism, we restrict to race builds),
// try to catch tests that use manual replication while the replication queue
// is active. Such tests are often flaky.
if knobs := r.store.TestingKnobs(); util.RaceEnabled &&
!knobs.DisableReplicateQueue &&
!knobs.AllowUnsynchronizedReplicationChanges {
bq := r.store.replicateQueue.baseQueue
bq.mu.Lock()
disabled := bq.mu.disabled
bq.mu.Unlock()
if !disabled {
return nil, errors.New("must disable replicate queue to use ChangeReplicas manually")
}
}
return r.changeReplicasImpl(ctx, desc, priority, reason, details, chgs)
}
func (r *Replica) changeReplicasImpl(
ctx context.Context,