// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
// Package lease provides functionality to create and manage sql schema leases.
package lease
import (
"context"
"fmt"
"sort"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/settingswatcher"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkeys"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descbuilder"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/internal/catkv"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/systemschema"
"github.com/cockroachdb/cockroach/pkg/sql/enum"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/regionliveness"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness"
kvstorage "github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/log/logcrash"
"github.com/cockroachdb/cockroach/pkg/util/metric"
"github.com/cockroachdb/cockroach/pkg/util/quotapool"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/syncutil/singleflight"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"github.com/cockroachdb/redact"
)
var errRenewLease = errors.New("renew lease on id")
var errReadOlderVersion = errors.New("read older descriptor version from store")
// LeaseDuration controls the duration of sql descriptor leases.
var LeaseDuration = settings.RegisterDurationSetting(
settings.ApplicationLevel,
"sql.catalog.descriptor_lease_duration",
"mean duration of sql descriptor leases, this actual duration is jitterred",
base.DefaultDescriptorLeaseDuration)
// LeaseJitterFraction controls the percent jitter around sql lease durations
var LeaseJitterFraction = settings.RegisterFloatSetting(
settings.ApplicationLevel,
"sql.catalog.descriptor_lease_jitter_fraction",
"mean duration of sql descriptor leases, this actual duration is jitterred",
base.DefaultDescriptorLeaseJitterFraction,
settings.Fraction)
// LeaseMonitorRangeFeedCheckInterval controls how often the leasing subsystem
// checks that the range feed is still emitting checkpoints.
var LeaseMonitorRangeFeedCheckInterval = settings.RegisterDurationSetting(
settings.ApplicationLevel,
"sql.catalog.descriptor_lease_monitor_range_feed.check_interval",
"the leasing subsystem will check for checkpoints for the range feed within "+
"this interval",
time.Minute*5,
)
// LeaseMonitorRangeFeedResetTime controls how long the range feed may go without
// a checkpoint before it is restarted.
var LeaseMonitorRangeFeedResetTime = settings.RegisterDurationSetting(
settings.ApplicationLevel,
"sql.catalog.descriptor_lease_monitor_range_feed.reset_time",
"once the range feed has stopped receiving checkpoints for this "+
"period of time the range feed will be restarted",
time.Minute*25,
)
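// The helper below is an illustrative sketch and not part of the original file: it
// shows how a jittered lease duration could be derived from the two settings above.
// The function name and the jitterSeed parameter (assumed to be a uniform random
// value in [0, 1)) are hypothetical.
func exampleJitteredLeaseDuration(sv *settings.Values, jitterSeed float64) time.Duration {
	mean := LeaseDuration.Get(sv)
	jitter := LeaseJitterFraction.Get(sv)
	// Scale the mean duration by a factor drawn from [1-jitter, 1+jitter].
	factor := 1 - jitter + 2*jitter*jitterSeed
	return time.Duration(float64(mean) * factor)
}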
//go:generate stringer -type=SessionBasedLeasingMode
type SessionBasedLeasingMode int64
const (
// SessionBasedLeasingAuto automatically picks a leasing mode
// based on the current version.
SessionBasedLeasingAuto SessionBasedLeasingMode = iota
// SessionBasedLeasingOff indicates that expiry based leasing is being used.
SessionBasedLeasingOff
// SessionBasedDualWrite indicates that expiry based and session based leasing
// are active concurrently, and both tables must be consulted by schema changes.
SessionBasedDualWrite
// SessionBasedDrain indicates that expiry based leases will not be granted or
// renewed. Valid pre-existing expiry based leases will still be respected.
SessionBasedDrain
// SessionBasedOnly indicates that only session based leases are active, and
// schema changes only need to consult this table.
SessionBasedOnly
)
var (
// SessionBasedLeasingModeByName maps session based leasing modes from name
// to enum values.
SessionBasedLeasingModeByName = map[string]SessionBasedLeasingMode{
"auto": SessionBasedLeasingAuto,
"off": SessionBasedLeasingOff,
"dual_write": SessionBasedDualWrite,
"drain": SessionBasedDrain,
"session": SessionBasedOnly,
}
)
// LeaseEnableSessionBasedLeasing is used to enable / disable support for
// session based leasing.
var LeaseEnableSessionBasedLeasing = settings.RegisterEnumSetting(
settings.ApplicationLevel,
"sql.catalog.experimental_use_session_based_leasing",
"enables session based leasing for internal testing.",
"auto",
map[SessionBasedLeasingMode]string{
SessionBasedLeasingAuto: "auto",
SessionBasedLeasingOff: "off",
SessionBasedDualWrite: "dual_write",
SessionBasedDrain: "drain",
SessionBasedOnly: "session",
},
)
// sessionBasedLeasingModeAtLeast returns true if the current leasing mode meets
// or exceeds the required minimum mode.
func (m *Manager) sessionBasedLeasingModeAtLeast(
ctx context.Context, minimumMode SessionBasedLeasingMode,
) bool {
return m.getSessionBasedLeasingMode(ctx) >= minimumMode
}
func readSessionBasedLeasingMode(
ctx context.Context, settings *cluster.Settings,
) SessionBasedLeasingMode {
// When the leasing mode is set to AUTO, session based leasing is used.
settingMode := LeaseEnableSessionBasedLeasing.Get(&settings.SV)
if settingMode == SessionBasedLeasingAuto {
return SessionBasedOnly
} else {
return settingMode
}
}
// getSessionBasedLeasingMode returns the current session based leasing mode.
func (m *Manager) getSessionBasedLeasingMode(ctx context.Context) SessionBasedLeasingMode {
return readSessionBasedLeasingMode(ctx, m.settings)
}
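// Illustrative sketch (hypothetical helper, not part of the original file): callers
// gate dual-write behavior on the current mode roughly like this.
func exampleIsDualWriteActive(ctx context.Context, m *Manager) bool {
	// Dual-write or any stricter mode means the session based lease table is
	// being written alongside (or instead of) the expiry based table.
	return m.sessionBasedLeasingModeAtLeast(ctx, SessionBasedDualWrite)
}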
// WaitForNoVersion returns once there are no unexpired leases left
// for any version of the descriptor.
func (m *Manager) WaitForNoVersion(
ctx context.Context,
id descpb.ID,
cachedDatabaseRegions regionliveness.CachedDatabaseRegions,
retryOpts retry.Options,
) error {
versions := []IDVersion{
{
Name: fmt.Sprintf("[%d]", id),
ID: id,
Version: 0, // Unused; the forAnyVersion flag is passed below.
},
}
// Increment the long wait gauge for wait for no version, if this function
// takes longer than the lease duration.
decAfterWait := m.IncGaugeAfterLeaseDuration(GaugeWaitForNoVersion)
defer decAfterWait()
wsTracker := startWaitStatsTracker(ctx)
defer wsTracker.end()
for lastCount, r := 0, retry.Start(retryOpts); r.Next(); {
now := m.storage.clock.Now()
detail, err := countLeasesWithDetail(ctx, m.storage.db, m.Codec(), cachedDatabaseRegions, m.settings, versions, now, true /*forAnyVersion*/)
if err != nil {
return err
}
if detail.count == 0 {
break
}
if detail.count != lastCount {
lastCount = detail.count
wsTracker.updateProgress(detail)
log.Infof(ctx, "waiting for %d leases to expire: desc=%d", detail.count, id)
}
if lastCount == 0 {
break
}
}
return nil
}
// WaitForOneVersion returns once there are no unexpired leases on the
// previous version of the descriptor. It returns the descriptor with the
// current version, though note that it will not be validated or hydrated.
//
// After returning there can only be versions of the descriptor >= to the
// returned version. Lease acquisition (see acquire()) maintains the
// invariant that no new leases for desc.Version-1 will be granted once
// desc.Version exists.
//
// If the descriptor is not found, an error will be returned. The error
// can be detected by using errors.Is(err, catalog.ErrDescriptorNotFound).
func (m *Manager) WaitForOneVersion(
ctx context.Context,
id descpb.ID,
regions regionliveness.CachedDatabaseRegions,
retryOpts retry.Options,
) (desc catalog.Descriptor, _ error) {
// Increment the long wait gauge for wait for one version, if this function
// takes longer than the lease duration.
decAfterWait := m.IncGaugeAfterLeaseDuration(GaugeWaitForOneVersion)
defer decAfterWait()
wsTracker := startWaitStatsTracker(ctx)
defer wsTracker.end()
for lastCount, r := 0, retry.Start(retryOpts); r.Next(); {
if err := m.storage.db.KV().Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) {
// Use the lower-level MaybeGetDescriptorByIDUnvalidated to avoid
// performing validation while waiting for leases to drain.
// Validation is somewhat expensive but more importantly, is not
// particularly desirable in this context: there are valid cases where
// descriptors can be removed or made invalid. For instance, the
// descriptor could be a type or a schema which is dropped by a subsequent
// concurrent schema change.
const isDescriptorRequired = false
cr := m.storage.newCatalogReader(ctx)
c, err := cr.GetByIDs(ctx, txn, []descpb.ID{id}, isDescriptorRequired, catalog.Any)
if err != nil {
return err
}
desc = c.LookupDescriptor(id)
if desc == nil {
return errors.Wrapf(catalog.ErrDescriptorNotFound, "waiting for leases to drain on descriptor %d", id)
}
return nil
}); err != nil {
return nil, err
}
// Check to see if there are any leases that still exist on the previous
// version of the descriptor.
now := m.storage.clock.Now()
descs := []IDVersion{NewIDVersionPrev(desc.GetName(), desc.GetID(), desc.GetVersion())}
detail, err := countLeasesWithDetail(ctx, m.storage.db, m.Codec(), regions, m.settings, descs, now, false /*forAnyVersion*/)
if err != nil {
return nil, err
}
if detail.count == 0 {
break
}
if detail.count != lastCount {
lastCount = detail.count
wsTracker.updateProgress(detail)
log.Infof(ctx, "waiting for %d leases to expire: desc=%v", detail.count, descs)
}
}
return desc, nil
}
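// Illustrative usage sketch (hypothetical caller, not part of the original file):
// wait for leases on the previous version of a descriptor to drain, using default
// retry options and passing nil for the cached region information (assumed to be
// acceptable here).
func exampleWaitForOneVersion(
	ctx context.Context, m *Manager, id descpb.ID,
) (catalog.Descriptor, error) {
	return m.WaitForOneVersion(ctx, id, nil /* regions */, retry.Options{})
}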
// IDVersion represents a descriptor ID, version pair that are
// meant to map to a single immutable descriptor.
type IDVersion struct {
// Name is only provided for pretty printing.
Name string
ID descpb.ID
Version descpb.DescriptorVersion
}
// NewIDVersionPrev returns an initialized IDVersion with the
// previous version of the descriptor.
func NewIDVersionPrev(name string, id descpb.ID, currVersion descpb.DescriptorVersion) IDVersion {
return IDVersion{Name: name, ID: id, Version: currVersion - 1}
}
// ensureVersion ensures that the latest version >= minVersion. It will
// check if the latest known version meets the criterion, or attempt to
// acquire a lease at the latest version with the hope that it meets
// the criterion.
func ensureVersion(
ctx context.Context, id descpb.ID, minVersion descpb.DescriptorVersion, m *Manager,
) error {
if s := m.findNewest(id); s != nil && minVersion <= s.GetVersion() {
return nil
}
if err := m.AcquireFreshestFromStore(ctx, id); err != nil {
return err
}
if s := m.findNewest(id); s != nil && s.GetVersion() < minVersion {
return errors.Errorf("version %d for descriptor %s does not exist yet", minVersion, s.GetName())
}
return nil
}
type historicalDescriptor struct {
desc catalog.Descriptor
expiration hlc.Timestamp // ModificationTime of the next descriptor
}
// Retrieves historical descriptors of given id within the lower and upper bound
// timestamp from the MVCC key range in decreasing modification time order. Any
// descriptor versions that were modified in the range [lower, upper) will be
// retrieved through an export request. A lower bound of an empty timestamp
// hlc.Timestamp{} will result in an error.
//
// In the following scenario v4 is our oldest active lease
// [v1@t1 ][v2@t3 ][v3@t5 ][v4@t7
//
// [start end]
//
// getDescriptorsFromStoreForInterval(..., start, end) will get back:
// [v3, v2] (reverse order)
//
// Note that this does not necessarily retrieve a descriptor version that was
// alive at the lower bound timestamp.
func getDescriptorsFromStoreForInterval(
ctx context.Context,
db *kv.DB,
codec keys.SQLCodec,
id descpb.ID,
lowerBound, upperBound hlc.Timestamp,
) ([]historicalDescriptor, error) {
// Ensure lower bound is not an empty timestamp (now).
if lowerBound.IsEmpty() {
return nil, errors.AssertionFailedf(
"getDescriptorsFromStoreForInterval: lower bound cannot be empty")
}
// TODO(ajwerner): We'll want to lift this limitation in order to allow this
// function to find descriptors which could not be found by leasing. This
// will also require some careful managing of expiration timestamps for the
// final descriptor.
if upperBound.IsEmpty() {
return nil, errors.AssertionFailedf(
"getDescriptorsFromStoreForInterval: upper bound cannot be empty")
}
// Create an export request (1 kv call) for all descriptors for the given
// descriptor ID written during the interval [lowerBound, upperBound).
batchRequestHeader := kvpb.Header{
Timestamp: upperBound.Prev(),
ReturnElasticCPUResumeSpans: true,
}
descriptorKey := catalogkeys.MakeDescMetadataKey(codec, id)
// Unmarshal key span retrieved from export request to construct historical descs.
var descriptorsRead []historicalDescriptor
for {
requestHeader := kvpb.RequestHeader{
Key: descriptorKey,
EndKey: descriptorKey.PrefixEnd(),
}
req := &kvpb.ExportRequest{
RequestHeader: requestHeader,
StartTime: lowerBound.Prev(),
MVCCFilter: kvpb.MVCCFilter_All,
}
// Export request returns descriptors in decreasing modification time.
res, pErr := kv.SendWrappedWith(ctx, db.NonTransactionalSender(), batchRequestHeader, req)
if pErr != nil {
return nil, errors.Wrapf(pErr.GoError(), "error in retrieving descs between %s, %s",
lowerBound, upperBound)
}
// Keep track of the most recently processed descriptor's modification time to
// set as the expiration for the next descriptor to process. Recall we process
// descriptors in decreasing modification time.
subsequentModificationTime := upperBound
exportResp := res.(*kvpb.ExportResponse)
for _, file := range exportResp.Files {
if err := func() error {
it, err := kvstorage.NewMemSSTIterator(file.SST, false, /* verify */
kvstorage.IterOptions{
// NB: We assume there will be no MVCC range tombstones here.
KeyTypes: kvstorage.IterKeyTypePointsOnly,
LowerBound: keys.MinKey,
UpperBound: keys.MaxKey,
})
if err != nil {
return err
}
defer func() {
if it != nil {
it.Close()
}
}()
// Convert each MVCC key value pair corresponding to the specified
// descriptor ID.
for it.SeekGE(kvstorage.NilKey); ; it.Next() {
if ok, err := it.Valid(); err != nil {
return err
} else if !ok {
// Close and nil out the iter to release the underlying resources.
it.Close()
it = nil
return nil
}
// Decode key and value of descriptor.
k := it.UnsafeKey()
descContent, err := it.UnsafeValue()
if err != nil {
return err
}
if descContent == nil {
return errors.Wrapf(errors.New("unsafe value error"), "error "+
"extracting raw bytes of descriptor with key %s modified between "+
"%s, %s", k.String(), k.Timestamp, subsequentModificationTime)
}
// Construct a plain descriptor.
value := roachpb.Value{RawBytes: descContent, Timestamp: k.Timestamp}
descBuilder, err := descbuilder.FromSerializedValue(&value)
if err != nil {
return err
}
// Construct a historical descriptor with expiration.
histDesc := historicalDescriptor{
desc: descBuilder.BuildImmutable(),
expiration: subsequentModificationTime,
}
descriptorsRead = append(descriptorsRead, histDesc)
// Update the expiration time for next descriptor.
subsequentModificationTime = k.Timestamp
}
}(); err != nil {
return nil, err
}
}
// Check if the ExportRequest paginated with a resume span.
if exportResp.ResumeSpan == nil {
break
}
descriptorKey = exportResp.ResumeSpan.Key
}
return descriptorsRead, nil
}
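// Illustrative sketch (hypothetical caller, not part of the original file): fetch
// every version of descriptor id that was modified in the half-open interval
// [asOf, newestKnownModificationTime), e.g. to backfill history older than the
// oldest leased copy.
func exampleHistoricalVersions(
	ctx context.Context,
	db *kv.DB,
	codec keys.SQLCodec,
	id descpb.ID,
	asOf, newestKnownModificationTime hlc.Timestamp,
) ([]historicalDescriptor, error) {
	return getDescriptorsFromStoreForInterval(ctx, db, codec, id, asOf, newestKnownModificationTime)
}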
// Read older descriptor versions for the particular timestamp from store
// through an ExportRequest. The ExportRequest queries the key span for versions
// in range [timestamp, earliest modification time in memory) or [timestamp:] if
// there are no active leases in memory. This is followed by a call to
// getForExpiration in case the ExportRequest doesn't grab the earliest
// descriptor version we are interested in, resulting in at most 2 KV calls.
//
// TODO(vivek, james): Future work:
// 1. Translate multiple simultaneous calls to this method into a single call
// as is done for acquireNodeLease().
// 2. Figure out a sane policy on when these descriptors should be purged.
// They are currently purged in PurgeOldVersions.
func (m *Manager) readOlderVersionForTimestamp(
ctx context.Context, id descpb.ID, timestamp hlc.Timestamp,
) ([]historicalDescriptor, error) {
// Retrieve the endTimestamp for our query, which will be the first
// modification timestamp above our query timestamp.
t := m.findDescriptorState(id, false /*create*/)
// A missing descriptor state indicates that this descriptor has been
// purged in the meantime. We should go back around in the acquisition
// loop to make the appropriate error appear.
if t == nil {
return nil, nil
}
endTimestamp, done := func() (hlc.Timestamp, bool) {
t.mu.Lock()
defer t.mu.Unlock()
// If there are no descriptors, then we won't have a valid end timestamp.
if len(t.mu.active.data) == 0 {
return hlc.Timestamp{}, true
}
// We permit gaps in historical versions. We want to find the timestamp
// that represents the start of the validity interval for the known version
// which immediately follows the timestamps we're searching for.
i := sort.Search(len(t.mu.active.data), func(i int) bool {
return timestamp.Less(t.mu.active.data[i].GetModificationTime())
})
// If the timestamp we're searching for is somehow after the last descriptor
// we have in play, then either we have the right descriptor, or some other
// shenanigans where we've evicted the descriptor has occurred.
//
// TODO(ajwerner): When we come to modify this code to allow us to find
// historical descriptors which have been dropped, we'll need to rework
// this case and support providing no upperBound to
// getDescriptorFromStoreForInterval.
if i == len(t.mu.active.data) ||
// If we found a descriptor that isn't the first descriptor, go and check
// whether the descriptor for which we're searching actually exists. This
// will deal with cases where a concurrent fetch filled it in for us.
i > 0 && timestamp.Less(t.mu.active.data[i-1].getExpiration(ctx)) {
return hlc.Timestamp{}, true
}
return t.mu.active.data[i].GetModificationTime(), false
}()
if done {
return nil, nil
}
// Retrieve descriptors in range [timestamp, endTimestamp) in decreasing
// modification time order.
descs, err := getDescriptorsFromStoreForInterval(
ctx, m.storage.db.KV(), m.Codec(), id, timestamp, endTimestamp,
)
if err != nil {
return nil, err
}
// In the case where the descriptor we're looking for is modified before the
// input timestamp, we get the descriptor before the earliest descriptor we
// have from either in memory or from the call to
// getDescriptorsFromStoreForInterval.
var earliestModificationTime hlc.Timestamp
if len(descs) == 0 {
earliestModificationTime = endTimestamp
} else {
earliestModificationTime = descs[len(descs)-1].desc.GetModificationTime()
}
// Unless the timestamp is exactly at the earliest modification time from
// ExportRequest, we'll invoke another call to retrieve the descriptor with
// modification time prior to the timestamp.
if timestamp.Less(earliestModificationTime) {
desc, err := m.storage.getForExpiration(ctx, earliestModificationTime, id)
if err != nil {
return nil, err
}
descs = append(descs, historicalDescriptor{
desc: desc,
expiration: earliestModificationTime,
})
}
return descs, nil
}
// Insert descriptor versions. The versions provided are not in
// any particular order.
func (m *Manager) insertDescriptorVersions(id descpb.ID, versions []historicalDescriptor) {
t := m.findDescriptorState(id, false /* create */)
t.mu.Lock()
defer t.mu.Unlock()
for i := range versions {
// Since we gave up the lock while reading the versions from
// the store we have to ensure that no one else inserted the
// same version.
existingVersion := t.mu.active.findVersion(versions[i].desc.GetVersion())
if existingVersion == nil {
t.mu.active.insert(
newDescriptorVersionState(t, versions[i].desc, versions[i].expiration, nil, nil, false))
}
}
}
// AcquireFreshestFromStore acquires a new lease from the store and
// inserts it into the active set. It guarantees that the lease returned is
// the one acquired after the call is made. Use this if the lease we want to
// get needs to see some descriptor updates that we know happened recently.
func (m *Manager) AcquireFreshestFromStore(ctx context.Context, id descpb.ID) error {
// Create descriptorState if needed.
_ = m.findDescriptorState(id, true /* create */)
// We need to acquire a lease on a "fresh" descriptor, meaning that joining
// a potential in-progress lease acquisition is generally not good enough.
// If we are to join an in-progress acquisition, it needs to be an acquisition
// initiated after this point.
// So, we handle two cases:
// 1. The first acquireNodeLease(..) call tells us that we didn't join an
// in-progress acquisition but rather initiated one. Great, the lease
// that's being acquired is good.
// 2. The first acquireNodeLease(..) call tells us that we did join an
// in-progress acquisition;
// We have to wait this acquisition out; it's not good for us. But any
// future acquisition is good, so the next time around the loop it doesn't
// matter if we initiate a request or join an in-progress one (because
// either way, it's an acquisition performed after this call).
attemptsMade := 0
for {
// Acquire a fresh lease.
didAcquire, err := acquireNodeLease(ctx, m, id, AcquireFreshestBlock)
if err != nil {
return err
}
if didAcquire {
// Case 1: we initiated a lease acquisition call.
break
} else if attemptsMade > 0 {
// Case 2: we joined an in-progress lease acquisition call but that call
// was initiated after we entered this function.
break
}
attemptsMade++
}
return nil
}
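// Illustrative sketch (hypothetical caller, not part of the original file): after a
// descriptor update is known to have committed, force a fresh acquisition so that
// any lease subsequently handed out was acquired after this point.
func exampleRefreshAfterUpdate(ctx context.Context, m *Manager, id descpb.ID) error {
	return m.AcquireFreshestFromStore(ctx, id)
}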
// If the lease cannot be obtained because the descriptor is in the process of
// being dropped or offline, the error will be of type inactiveTableError.
// The boolean returned is true if this call was actually responsible for the
// lease acquisition.
func acquireNodeLease(
ctx context.Context, m *Manager, id descpb.ID, typ AcquireType,
) (bool, error) {
start := timeutil.Now()
log.VEventf(ctx, 2, "acquiring lease for descriptor %d...", id)
var toRelease *storedLease
future, didAcquire := m.storage.group.DoChan(ctx,
strconv.Itoa(int(id)),
singleflight.DoOpts{
Stop: m.stopper,
InheritCancelation: false,
},
func(ctx context.Context) (interface{}, error) {
if m.IsDraining() {
return nil, errors.New("cannot acquire lease when draining")
}
newest := m.findNewest(id)
var minExpiration hlc.Timestamp
var lastLease *storedLease
if newest != nil {
minExpiration = newest.getExpiration(ctx)
lastLease = newest.getStoredLease()
}
// A session will be populated within the leasing infrastructure only when
// session based leasing is enabled. This session will be stored both inside
// the leases table and descriptor version states in memory, and can be
// consulted for the expiry depending on the mode
// (see SessionBasedLeasingMode).
var session sqlliveness.Session
if m.sessionBasedLeasingModeAtLeast(ctx, SessionBasedDualWrite) {
var err error
session, err = m.storage.livenessProvider.Session(ctx)
if err != nil {
return nil, errors.Wrapf(err, "lease acquisition was unable to resolve liveness session")
}
}
desc, expiration, regionPrefix, err := m.storage.acquire(ctx, minExpiration, session, id, lastLease)
if err != nil {
return nil, err
}
t := m.findDescriptorState(id, false /* create */)
if t == nil {
return nil, errors.AssertionFailedf("could not find descriptor state for id %d", id)
}
t.mu.Lock()
t.mu.takenOffline = false
defer t.mu.Unlock()
var newDescVersionState *descriptorVersionState
newDescVersionState, toRelease, err = t.upsertLeaseLocked(ctx, desc, expiration, session, regionPrefix)
if err != nil {
return nil, err
}
if newDescVersionState != nil {
m.names.insert(ctx, newDescVersionState)
}
if toRelease != nil {
releaseLease(ctx, toRelease, m)
}
return true, nil
})
if m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent != nil {
m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent(typ, id)
}
result := future.WaitForResult(ctx)
if result.Err != nil {
return false, result.Err
}
log.VEventf(ctx, 2, "acquired lease for descriptor %d, took %v", id, timeutil.Since(start))
return didAcquire, nil
}
// releaseLease deletes an entry from system.lease.
func releaseLease(ctx context.Context, lease *storedLease, m *Manager) (released bool) {
// Force the release to happen synchronously if we are draining or when we
// force removals for unit tests. This didn't matter with expiration based leases,
// since each renewal would have a different expiry (but the same version in
// synthetic scenarios). In the session based model renewals will come in with
// the same session ID, and we can potentially end up racing with inserts and
// deletes on the storage side. In real world scenarios this never happens,
// because we release only if a new version exists.
if m.IsDraining() || m.removeOnceDereferenced() {
// Release synchronously to guarantee release before exiting.
// It's possible for the context to get cancelled, so return if
// it was released.
return m.storage.release(ctx, m.stopper, lease)
}
// Release to the store asynchronously, without the descriptorState lock.
newCtx := m.ambientCtx.AnnotateCtx(context.Background())
// AddTags and not WithTags, so that we combine the tags with those
// filled by AnnotateCtx.
newCtx = logtags.AddTags(newCtx, logtags.FromContext(ctx))
if err := m.stopper.RunAsyncTask(
newCtx, "sql.descriptorState: releasing descriptor lease",
func(ctx context.Context) {
m.storage.release(ctx, m.stopper, lease)
}); err != nil {
log.Warningf(ctx, "error: %s, not releasing lease: %q", err, lease)
}
// Asynchronous job is releasing it.
return true
}
// purgeOldVersions removes old unused descriptor versions older than
// minVersion and releases any associated leases.
// If dropped is set, minVersion is ignored; no lease is acquired and all
// existing unused versions are removed. The descriptor is further marked dropped,
// which will cause existing in-use leases to be eagerly released once
// they're not in use any more.
// If t has no active leases, nothing is done.
func purgeOldVersions(
ctx context.Context,
db *kv.DB,
id descpb.ID,
dropped bool,
minVersion descpb.DescriptorVersion,
m *Manager,
) error {
t := m.findDescriptorState(id, false /*create*/)
if t == nil {
return nil
}
empty := func() bool {
t.mu.Lock()
defer t.mu.Unlock()
if t.mu.maxVersionSeen < minVersion {
t.mu.maxVersionSeen = minVersion
}
return len(t.mu.active.data) == 0 && t.mu.acquisitionsInProgress == 0
}()
if empty && !dropped {
// We don't currently have a version on this descriptor, so no need to refresh
// anything.
return nil
}
removeInactives := func(dropped bool) {
leases, leaseToExpire := func() (leasesToRemove []*storedLease, leasesToExpire *descriptorVersionState) {
t.mu.Lock()
defer t.mu.Unlock()
t.mu.takenOffline = dropped
return t.removeInactiveVersions(), t.mu.active.findPreviousToExpire(dropped)
}()
for _, l := range leases {
releaseLease(ctx, l, m)
}
// If there are old versions with an active refcount, we cannot allow
// these to stay forever. So, set up an expiry on them, which is required
// for session based leases.
if leaseToExpire != nil {
func() {
m.mu.Lock()
leaseToExpire.mu.Lock()
defer leaseToExpire.mu.Unlock()
defer m.mu.Unlock()
// In dual-write mode there will already be an expiration set,
// so we don't need to modify it if it's valid.
if leaseToExpire.mu.expiration.Less(m.storage.db.KV().Clock().Now()) {
// Expire any active old versions into the future based on the lease
// duration. If the session lifetime is longer, use that instead.
// We will only push the expiration further into the future than what
// was previously observed, since transactions may have already
// picked this time. If the lease duration is zero, then we are
// looking at instant expiration for testing.
leaseDuration := LeaseDuration.Get(&m.storage.settings.SV)
leaseToExpire.mu.expiration = m.storage.db.KV().Clock().Now().AddDuration(leaseDuration)
if sessionExpiry := leaseToExpire.mu.session.Expiration(); leaseDuration > 0 && leaseToExpire.mu.expiration.Less(sessionExpiry) {
leaseToExpire.mu.expiration = sessionExpiry
}
}
leaseToExpire.mu.session = nil
if leaseToExpire.mu.lease != nil {
m.storage.sessionBasedLeasesWaitingToExpire.Inc(1)
m.mu.leasesToExpire = append(m.mu.leasesToExpire, leaseToExpire)
}
}()
}
}
if dropped {
removeInactives(true /* dropped */)
return nil
}
if err := ensureVersion(ctx, id, minVersion, m); err != nil {
return err
}
// Acquire a refcount on the descriptor on the latest version to maintain an
// active lease, so that it doesn't get released when removeInactives()
// is called below. Release this lease after calling removeInactives().
desc, _, err := t.findForTimestamp(ctx, m.storage.clock.Now())
if isInactive := catalog.HasInactiveDescriptorError(err); err == nil || isInactive {
removeInactives(isInactive)
if desc != nil {
t.release(ctx, desc)
return nil
}
return nil
}
return err
}
// AcquireType is the type of blocking result event when
// calling LeaseAcquireResultBlockEvent.
type AcquireType int
const (
// AcquireBlock denotes the LeaseAcquireResultBlockEvent is
// coming from descriptorState.acquire().
AcquireBlock AcquireType = iota
// AcquireFreshestBlock denotes the LeaseAcquireResultBlockEvent is
// from descriptorState.acquireFreshestFromStore().
AcquireFreshestBlock
// AcquireBackground happens due to periodic background refreshes.
AcquireBackground
)
// Manager manages acquiring and releasing per-descriptor leases. It also
// handles resolving descriptor names to descriptor IDs. The leases are managed
// internally with a descriptor and expiration time exported by the
// API. The descriptor acquired needs to be released. A transaction
// can use a descriptor as long as its timestamp is within the
// validity window for the descriptor:
// descriptor.ModificationTime <= txn.ReadTimestamp < expirationTime
//
// Exported only for testing.
//
// The locking order is:
// Manager.mu > descriptorState.mu > nameCache.mu > descriptorVersionState.mu
type Manager struct {
rangeFeedFactory *rangefeed.Factory
storage storage
settings *cluster.Settings
mu struct {
syncutil.Mutex
// TODO(james): Track size of leased descriptors in memory.
descriptors map[descpb.ID]*descriptorState
// Session based leases that will be removed with expiry, since
// a new version has arrived.
leasesToExpire []*descriptorVersionState
// rangeFeedCheckpoints tracks the health of the range feed by counting
// the number of observed checkpoints.
rangeFeedCheckpoints int
// rangeFeedIsUnavailableAt tracks when the range feed first became unavailable
// or when it was last restarted after unavailability.
rangeFeedIsUnavailableAt time.Time
// rangeFeed current range feed on system.descriptors.
rangeFeed *rangefeed.RangeFeed
}
// closeTimestamp is the closed timestamp for the range feed, i.e. the
// timestamp up to which we have received all updates.
closeTimestamp atomic.Value
draining atomic.Value
// names is a cache for name -> id mappings. A mapping for the cache
// should only be used if we currently have an active lease on the respective
// id; otherwise, the mapping may well be stale.
// Not protected by mu.
names nameCache
testingKnobs ManagerTestingKnobs
ambientCtx log.AmbientContext
stopper *stop.Stopper
sem *quotapool.IntPool
refreshAllLeases chan struct{}
// descUpdateCh receives updated descriptors from the range feed.
descUpdateCh chan catalog.Descriptor
// descDelCh receives deleted descriptors from the range feed.
descDelCh chan descpb.ID
// rangefeedErrCh receives any terminal errors from the rangefeed.
rangefeedErrCh chan error
}
const leaseConcurrencyLimit = 5
// NewLeaseManager creates a new Manager.
//
// internalExecutor can be nil to help bootstrapping, but then it needs to be set via
// SetInternalExecutor before the Manager is used.
//
// stopper is used to run async tasks. Can be nil in tests.
func NewLeaseManager(
ambientCtx log.AmbientContext,
nodeIDContainer *base.SQLIDContainer,
db isql.DB,
clock *hlc.Clock,
settings *cluster.Settings,
settingsWatcher *settingswatcher.SettingsWatcher,
livenessProvider sqlliveness.Provider,
codec keys.SQLCodec,
testingKnobs ManagerTestingKnobs,
stopper *stop.Stopper,
rangeFeedFactory *rangefeed.Factory,
) *Manager {
lm := &Manager{
storage: storage{
nodeIDContainer: nodeIDContainer,
db: db,
clock: clock,
settings: settings,
codec: codec,
livenessProvider: livenessProvider,
sysDBCache: catkv.NewSystemDatabaseCache(codec, settings),
group: singleflight.NewGroup("acquire-lease", "descriptor ID"),
testingKnobs: testingKnobs.LeaseStoreTestingKnobs,
leasingMetrics: leasingMetrics{
outstandingLeases: metric.NewGauge(metric.Metadata{
Name: "sql.leases.active",
Help: "The number of outstanding SQL schema leases.",
Measurement: "Outstanding leases",
Unit: metric.Unit_COUNT,
}),
sessionBasedLeasesWaitingToExpire: metric.NewGauge(metric.Metadata{
Name: "sql.leases.waiting_to_expire",
Help: "The number of outstanding session based SQL schema leases with expiry.",
Measurement: "Outstanding Leases Waiting to Expire",
Unit: metric.Unit_COUNT,
}),
sessionBasedLeasesExpired: metric.NewGauge(metric.Metadata{
Name: "sql.leases.expired",
Help: "The number of outstanding session based SQL schema leases expired.",
Measurement: "Leases expired because of a new version",
Unit: metric.Unit_COUNT,
}),
longWaitForNoVersionsActive: metric.NewGauge(metric.Metadata{
Name: "sql.leases.long_wait_for_no_version",
Help: "The number of wait for no versions that are taking more than the lease duration.",
Measurement: "Number of wait for long wait for no version routines executing",
Unit: metric.Unit_COUNT,
}),
longWaitForOneVersionsActive: metric.NewGauge(metric.Metadata{
Name: "sql.leases.long_wait_for_one_version",
Help: "The number of wait for one versions that are taking more than the lease duration.",
Measurement: "Number of wait for long wait for one version routines executing",
Unit: metric.Unit_COUNT,
}),
longTwoVersionInvariantViolationWaitActive: metric.NewGauge(metric.Metadata{
Name: "sql.leases.long_wait_for_two_version_invariant",
Help: "The number of two version invariant waits that are taking more than the lease duration.",
Measurement: "Number of two version invariant wait routines executing",
Unit: metric.Unit_COUNT,
}),
},
},
settings: settings,
rangeFeedFactory: rangeFeedFactory,
testingKnobs: testingKnobs,
names: makeNameCache(),
ambientCtx: ambientCtx,
stopper: stopper,
sem: quotapool.NewIntPool("lease manager", leaseConcurrencyLimit),
refreshAllLeases: make(chan struct{}),
}
lm.storage.regionPrefix = &atomic.Value{}
lm.storage.regionPrefix.Store(enum.One)
lm.storage.sessionBasedLeasingMode = lm
lm.storage.writer = newKVWriter(codec, db.KV(), keys.LeaseTableID, settingsWatcher, lm)
lm.stopper.AddCloser(lm.sem.Closer("stopper"))
lm.mu.descriptors = make(map[descpb.ID]*descriptorState)
// We are going to start the range feed later, when RefreshLeases
// is invoked inside pre-start. That guarantees all range feed events
// that will be generated will be after the current time, so historical
// queries within this tenant (i.e. the PCR catalog reader) before this
// point are guaranteed to be up to date.
lm.closeTimestamp.Store(db.KV().Clock().Now())
lm.draining.Store(false)
lm.descUpdateCh = make(chan catalog.Descriptor)
lm.descDelCh = make(chan descpb.ID)
lm.rangefeedErrCh = make(chan error)
return lm
}
// NameMatchesDescriptor returns true if the provided name and IDs match this
// descriptor.
func NameMatchesDescriptor(