-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
Copy pathchangefeed_processors.go
1962 lines (1719 loc) · 65.5 KB
/
changefeed_processors.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.
package changefeedccl
import (
"context"
"fmt"
"math/rand"
"sync"
"time"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdcutils"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvevent"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvfeed"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/resolvedspan"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/schemafeed"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql"
"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/util/cancelchecker"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/log/logcrash"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/span"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"github.com/cockroachdb/redact"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
const EnableCloudBillingAccountingEnvVar = "COCKROACH_ENABLE_CLOUD_BILLING_ACCOUNTING"
var EnableCloudBillingAccounting = envutil.EnvOrDefaultBool(EnableCloudBillingAccountingEnvVar, false)
type changeAggregator struct {
execinfra.ProcessorBase
spec execinfrapb.ChangeAggregatorSpec
memAcc mon.BoundAccount
// checkForNodeDrain is a callback that returns an error
// if this node is being drained.
checkForNodeDrain func() error
// cancel cancels the context passed to all resources created while starting
// this aggregator.
cancel func()
// errCh contains the return values of the kvfeed.
errCh chan error
// kvFeedDoneCh is closed when the kvfeed exits.
kvFeedDoneCh chan struct{}
// sink is the Sink to write rows to. Resolved timestamps are never written
// by changeAggregator.
sink EventSink
// changedRowBuf, if non-nil, contains changed rows to be emitted. Anything
// queued in `resolvedSpanBuf` is dependent on these having been emitted, so
// this one must be empty before moving on to that one.
changedRowBuf *encDatumRowBuffer
// resolvedSpanBuf contains resolved span updates to send to changeFrontier.
// If sink is a bufferSink, it must be emptied before these are sent.
resolvedSpanBuf encDatumRowBuffer
// lastPush records the time when we last pushed data to the coordinator.
lastPush time.Time
// recentKVCount contains the number of emits since the last time a resolved
// span was forwarded to the frontier
recentKVCount uint64
// eventProducer produces the next event from the kv feed.
eventProducer kvevent.Reader
// eventConsumer consumes the event.
eventConsumer eventConsumer
nextHighWaterFlush time.Time // next time high watermark may be flushed.
flushFrequency time.Duration // how often high watermark can be checkpointed.
lastSpanFlush time.Time // last time expensive, span based checkpoint was written.
// frontier keeps track of resolved timestamps for spans along with schema change
// boundary information.
frontier *resolvedspan.AggregatorFrontier
metrics *Metrics
sliMetrics *sliMetrics
sliMetricsID int64
closeTelemetryRecorder func()
knobs TestingKnobs
}
type timestampLowerBoundOracle interface {
inclusiveLowerBoundTS() hlc.Timestamp
}
type changeAggregatorLowerBoundOracle struct {
sf frontier
initialInclusiveLowerBound hlc.Timestamp
}
// inclusiveLowerBoundTs is used to generate a representative timestamp to name
// cloudStorageSink files. This timestamp is either the statement time (in case this
// changefeed job hasn't yet seen any resolved timestamps) or the successor timestamp to
// the local span frontier. This convention is chosen to preserve CDC's ordering
// guarantees. See comment on cloudStorageSink for more details.
func (o *changeAggregatorLowerBoundOracle) inclusiveLowerBoundTS() hlc.Timestamp {
if frontier := o.sf.Frontier(); !frontier.IsEmpty() {
// We call `Next()` here on the frontier because this allows us
// to name files using a timestamp that is an inclusive lower bound
// on the timestamps of the updates contained within the file.
// Files being created at the point this method is called are guaranteed
// to contain row updates with timestamps strictly greater than the local
// span frontier timestamp.
return frontier.Next()
}
// This should only be returned in the case where the changefeed job hasn't yet
// seen a resolved timestamp.
return o.initialInclusiveLowerBound
}
var _ execinfra.Processor = &changeAggregator{}
var _ execinfra.RowSource = &changeAggregator{}
var aggregatorEmitsShutdownCheckpoint = settings.RegisterBoolSetting(
settings.ApplicationLevel,
"changefeed.shutdown_checkpoint.enabled",
"upon shutdown aggregator attempts to emit an up-to-date checkpoint",
false,
)
type drainWatcher <-chan struct{}
func (w drainWatcher) checkForNodeDrain() error {
select {
case <-w:
return changefeedbase.ErrNodeDraining
default:
return nil
}
}
func (w drainWatcher) enabled() bool {
return w != nil
}
func makeDrainWatcher(flowCtx *execinfra.FlowCtx) (w drainWatcher, _ func()) {
if !aggregatorEmitsShutdownCheckpoint.Get(&flowCtx.Cfg.Settings.SV) {
// Drain watcher disabled
return nil, func() {}
}
if cfKnobs, ok := flowCtx.TestingKnobs().Changefeed.(*TestingKnobs); ok && cfKnobs != nil && cfKnobs.OnDrain != nil {
return cfKnobs.OnDrain(), func() {}
}
return flowCtx.Cfg.JobRegistry.OnDrain()
}
func newChangeAggregatorProcessor(
ctx context.Context,
flowCtx *execinfra.FlowCtx,
processorID int32,
spec execinfrapb.ChangeAggregatorSpec,
post *execinfrapb.PostProcessSpec,
) (_ execinfra.Processor, retErr error) {
// Setup monitoring for this node drain.
drainWatcher, drainDone := makeDrainWatcher(flowCtx)
defer func() {
if retErr != nil {
drainDone()
}
}()
memMonitor := execinfra.NewMonitor(ctx, flowCtx.Mon, "changeagg-mem")
ca := &changeAggregator{
spec: spec,
memAcc: memMonitor.MakeBoundAccount(),
checkForNodeDrain: drainWatcher.checkForNodeDrain,
}
if err := ca.Init(
ctx,
ca,
post,
changefeedResultTypes,
flowCtx,
processorID,
memMonitor,
execinfra.ProcStateOpts{
TrailingMetaCallback: func() (producerMeta []execinfrapb.ProducerMetadata) {
defer drainDone()
if drainWatcher.enabled() {
var meta execinfrapb.ChangefeedMeta
if err := drainWatcher.checkForNodeDrain(); err != nil {
// This node is draining. Indicate so in the trailing metadata.
nodeID, _ := flowCtx.Cfg.NodeID.OptionalNodeID()
meta.DrainInfo = &execinfrapb.ChangefeedMeta_DrainInfo{NodeID: nodeID}
}
ca.computeTrailingMetadata(&meta)
producerMeta = []execinfrapb.ProducerMetadata{{Changefeed: &meta}}
}
ca.close()
return producerMeta
},
},
); err != nil {
return nil, err
}
opts := changefeedbase.MakeStatementOptions(ca.spec.Feed.Opts)
// MinCheckpointFrequency controls how frequently the changeAggregator flushes the sink
// and checkpoints the local frontier to changeFrontier. It is used as a rough
// approximation of how latency-sensitive the changefeed user is. For a high latency
// user, such as cloud storage sink where flushes can take much longer, it is often set
// as the sink's flush frequency so as not to negate the sink's batch config.
//
// If a user does not specify a 'min_checkpoint_frequency' duration, we instead default
// to 30s, which is hopefully long enough to account for most possible sink latencies we
// could see without falling too behind.
//
// NB: As long as we periodically get new span-level resolved timestamps
// from the poller (which should always happen, even if the watched data is
// not changing), then this is sufficient and we don't have to do anything
// fancy with timers.
// // TODO(casper): add test for OptMinCheckpointFrequency.
checkpointFreq, err := opts.GetMinCheckpointFrequency()
if err != nil {
return nil, err
}
if checkpointFreq != nil {
ca.flushFrequency = *checkpointFreq
} else {
ca.flushFrequency = changefeedbase.DefaultMinCheckpointFrequency
}
return ca, nil
}
// MustBeStreaming implements the execinfra.Processor interface.
func (ca *changeAggregator) MustBeStreaming() bool {
return true
}
// wrapMetricsRecorderWithTelemetry wraps the supplied metricsRecorder
// so it periodically emits metrics to telemetry.
func (ca *changeAggregator) wrapMetricsRecorderWithTelemetry(
ctx context.Context, recorder metricsRecorder,
) (metricsRecorder, error) {
details := ca.spec.Feed
jobID := ca.spec.JobID
description := ca.spec.Description
// This code exists so that the old behavior is preserved if the spec is created
// in a mixed-version cluster on a node without the new Description field.
// It can be deleted once we no longer need to interoperate with binaries that
// are version 24.3 or earlier.
if description == "" {
// Don't emit telemetry messages for core changefeeds without a description.
if ca.isSinkless() {
return recorder, nil
}
job, err := ca.FlowCtx.Cfg.JobRegistry.LoadJob(ctx, jobID)
if err != nil {
return nil, err
}
description = job.Payload().Description
}
recorderWithTelemetry, err := wrapMetricsRecorderWithTelemetry(ctx, details, description, jobID, ca.FlowCtx.Cfg.Settings, recorder, ca.knobs)
if err != nil {
return ca.sliMetrics, err
}
ca.closeTelemetryRecorder = recorderWithTelemetry.close
return recorderWithTelemetry, nil
}
const (
changeAggregatorLogTag = "change-aggregator"
changeFrontierLogTag = "change-frontier"
)
// Start is part of the RowSource interface.
func (ca *changeAggregator) Start(ctx context.Context) {
// Derive a separate context so that we can shutdown the poller.
ctx, ca.cancel = ca.FlowCtx.Stopper().WithCancelOnQuiesce(ctx)
if ca.spec.JobID != 0 {
ctx = logtags.AddTag(ctx, "job", ca.spec.JobID)
}
ctx = logtags.RemoveTag(ctx, changeFrontierLogTag)
ctx = logtags.AddTag(ctx, changeAggregatorLogTag, nil /* value */)
ctx = ca.StartInternal(ctx, changeAggregatorProcName)
spans, err := ca.setupSpansAndFrontier()
if err != nil {
if log.V(2) {
log.Infof(ca.Ctx(), "change aggregator moving to draining due to error setting up spans and frontier: %v", err)
}
ca.MoveToDraining(err)
ca.cancel()
return
}
feed := makeChangefeedConfigFromJobDetails(ca.spec.Feed)
opts := feed.Opts
timestampOracle := &changeAggregatorLowerBoundOracle{
sf: ca.frontier,
initialInclusiveLowerBound: feed.ScanTime,
}
if cfKnobs, ok := ca.FlowCtx.TestingKnobs().Changefeed.(*TestingKnobs); ok {
ca.knobs = *cfKnobs
}
// The job registry has a set of metrics used to monitor the various jobs it
// runs. They're all stored as the `metric.Struct` interface because of
// dependency cycles.
ca.metrics = ca.FlowCtx.Cfg.JobRegistry.MetricsStruct().Changefeed.(*Metrics)
scope, _ := opts.GetMetricScope()
ca.sliMetrics, err = ca.metrics.getSLIMetrics(scope)
if err != nil {
if log.V(2) {
log.Infof(ca.Ctx(), "change aggregator moving to draining due to error getting sli metrics: %v", err)
}
ca.MoveToDraining(err)
ca.cancel()
return
}
ca.sliMetricsID = ca.sliMetrics.claimId()
recorder := metricsRecorder(ca.sliMetrics)
recorder, err = ca.wrapMetricsRecorderWithTelemetry(ctx, recorder)
if err != nil {
if log.V(2) {
log.Infof(ca.Ctx(), "change aggregator moving to draining due to error wrapping metrics controller: %v", err)
}
ca.MoveToDraining(err)
ca.cancel()
return
}
ca.sink, err = getEventSink(ctx, ca.FlowCtx.Cfg, ca.spec.Feed, timestampOracle,
ca.spec.User(), ca.spec.JobID, recorder)
if err != nil {
err = changefeedbase.MarkRetryableError(err)
if log.V(2) {
log.Infof(ca.Ctx(), "change aggregator moving to draining due to error getting sink: %v", err)
}
ca.MoveToDraining(err)
ca.cancel()
return
}
// This is the correct point to set up certain hooks depending on the sink
// type.
if b, ok := ca.sink.(*bufferSink); ok {
ca.changedRowBuf = &b.buf
}
// If the initial scan was disabled the highwater would've already been forwarded
needsInitialScan := ca.frontier.Frontier().IsEmpty()
// The "HighWater" of the KVFeed is the timestamp it will begin streaming
// change events from. When there's an inital scan, we want the scan to cover
// data up to the StatementTime and change events to begin from that point.
kvFeedHighWater := ca.frontier.Frontier()
if needsInitialScan {
kvFeedHighWater = ca.spec.Feed.StatementTime
}
// TODO(yevgeniy): Introduce separate changefeed monitor that's a parent
// for all changefeeds to control memory allocated to all changefeeds.
pool := ca.FlowCtx.Cfg.BackfillerMonitor
if ca.knobs.MemMonitor != nil {
pool = ca.knobs.MemMonitor
}
limit := changefeedbase.PerChangefeedMemLimit.Get(&ca.FlowCtx.Cfg.Settings.SV)
ca.eventProducer, ca.kvFeedDoneCh, ca.errCh, err = ca.startKVFeed(ctx, spans, kvFeedHighWater, needsInitialScan, feed, pool, limit, opts)
if err != nil {
if log.V(2) {
log.Infof(ca.Ctx(), "change aggregator moving to draining due to error starting kv feed: %v", err)
}
ca.MoveToDraining(err)
ca.cancel()
return
}
ca.sink = &errorWrapperSink{wrapped: ca.sink}
ca.eventConsumer, ca.sink, err = newEventConsumer(
ctx, ca.FlowCtx.Cfg, ca.spec, feed, ca.frontier, kvFeedHighWater,
ca.sink, ca.metrics, ca.sliMetrics, ca.knobs)
if err != nil {
if log.V(2) {
log.Infof(ca.Ctx(), "change aggregator moving to draining due to error creating event consumer: %v", err)
}
ca.MoveToDraining(err)
ca.cancel()
return
}
// Init heartbeat timer.
ca.lastPush = timeutil.Now()
// Generate expensive checkpoint only after we ran for a while.
ca.lastSpanFlush = timeutil.Now()
}
func (ca *changeAggregator) startKVFeed(
ctx context.Context,
spans []roachpb.Span,
initialHighWater hlc.Timestamp,
needsInitialScan bool,
config ChangefeedConfig,
parentMemMon *mon.BytesMonitor,
memLimit int64,
opts changefeedbase.StatementOptions,
) (kvevent.Reader, chan struct{}, chan error, error) {
cfg := ca.FlowCtx.Cfg
kvFeedMemMon := mon.NewMonitorInheritWithLimit("kvFeed", memLimit, parentMemMon, false /* longLiving */)
kvFeedMemMon.StartNoReserved(ctx, parentMemMon)
buf := kvevent.NewThrottlingBuffer(
kvevent.NewMemBuffer(kvFeedMemMon.MakeBoundAccount(), &cfg.Settings.SV, &ca.metrics.KVFeedMetrics.AggregatorBufferMetricsWithCompat),
cdcutils.NodeLevelThrottler(&cfg.Settings.SV, &ca.metrics.ThrottleMetrics))
// KVFeed takes ownership of the kvevent.Writer portion of the buffer, while
// we return the kvevent.Reader part to the caller.
kvfeedCfg, err := ca.makeKVFeedCfg(ctx, config, spans, buf, initialHighWater, needsInitialScan, kvFeedMemMon, opts)
if err != nil {
return nil, nil, nil, err
}
errCh := make(chan error, 1)
doneCh := make(chan struct{})
// If RunAsyncTask immediately returns an error, the kvfeed was not run and
// will not run.
if err := ca.FlowCtx.Stopper().RunAsyncTask(ctx, "changefeed-poller", func(ctx context.Context) {
defer close(doneCh)
defer kvFeedMemMon.Stop(ctx)
errCh <- kvfeed.Run(ctx, kvfeedCfg)
}); err != nil {
// Ensure that the memory monitor is closed properly.
kvFeedMemMon.Stop(ctx)
return nil, nil, nil, err
}
return buf, doneCh, errCh, nil
}
func (ca *changeAggregator) checkKVFeedErr() error {
select {
case err := <-ca.errCh:
return err
default:
return nil
}
}
func (ca *changeAggregator) makeKVFeedCfg(
ctx context.Context,
config ChangefeedConfig,
spans []roachpb.Span,
buf kvevent.Writer,
initialHighWater hlc.Timestamp,
needsInitialScan bool,
memMon *mon.BytesMonitor,
opts changefeedbase.StatementOptions,
) (kvfeed.Config, error) {
schemaChange, err := config.Opts.GetSchemaChangeHandlingOptions()
if err != nil {
return kvfeed.Config{}, err
}
filters := config.Opts.GetFilters()
cfg := ca.FlowCtx.Cfg
initialScanOnly := config.EndTime == initialHighWater
var sf schemafeed.SchemaFeed
if schemaChange.Policy == changefeedbase.OptSchemaChangePolicyIgnore || initialScanOnly {
sf = schemafeed.DoNothingSchemaFeed
} else {
sf = schemafeed.New(ctx, cfg, schemaChange.EventClass, AllTargets(ca.spec.Feed),
initialHighWater, &ca.metrics.SchemaFeedMetrics, config.Opts.GetCanHandle())
}
monitoringCfg, err := makeKVFeedMonitoringCfg(ctx, ca.sliMetrics, opts, ca.FlowCtx.Cfg.Settings)
if err != nil {
return kvfeed.Config{}, err
}
return kvfeed.Config{
Writer: buf,
Settings: cfg.Settings,
DB: cfg.DB.KV(),
Codec: cfg.Codec,
Clock: cfg.DB.KV().Clock(),
Spans: spans,
CheckpointSpans: ca.spec.Checkpoint.Spans,
CheckpointTimestamp: ca.spec.Checkpoint.Timestamp,
Targets: AllTargets(ca.spec.Feed),
Metrics: &ca.metrics.KVFeedMetrics,
MM: memMon,
InitialHighWater: initialHighWater,
EndTime: config.EndTime,
WithDiff: filters.WithDiff,
WithFiltering: filters.WithFiltering,
WithFrontierQuantize: changefeedbase.Quantize.Get(&cfg.Settings.SV),
NeedsInitialScan: needsInitialScan,
SchemaChangeEvents: schemaChange.EventClass,
SchemaChangePolicy: schemaChange.Policy,
SchemaFeed: sf,
Knobs: ca.knobs.FeedKnobs,
ScopedTimers: ca.sliMetrics.Timers,
MonitoringCfg: monitoringCfg,
ConsumerID: int64(ca.spec.JobID),
}, nil
}
func makeKVFeedMonitoringCfg(
ctx context.Context,
sliMetrics *sliMetrics,
opts changefeedbase.StatementOptions,
settings *cluster.Settings,
) (kvfeed.MonitoringConfig, error) {
laggingRangesThreshold, laggingRangesInterval, err := opts.GetLaggingRangesConfig(ctx, settings)
if err != nil {
return kvfeed.MonitoringConfig{}, err
}
return kvfeed.MonitoringConfig{
LaggingRangesCallback: sliMetrics.getLaggingRangesCallback(),
LaggingRangesThreshold: laggingRangesThreshold,
LaggingRangesPollingInterval: laggingRangesInterval,
OnBackfillCallback: sliMetrics.getBackfillCallback(),
OnBackfillRangeCallback: sliMetrics.getBackfillRangeCallback(),
}, nil
}
// setupSpans is called on start to extract the spans for this changefeed as a
// slice and creates a span frontier with the initial resolved timestamps. This
// SpanFrontier only tracks the spans being watched on this node. There is a
// different SpanFrontier elsewhere for the entire changefeed. This object is
// used to filter out some previously emitted rows, and by the cloudStorageSink
// to name its output files in lexicographically monotonic fashion.
func (ca *changeAggregator) setupSpansAndFrontier() (spans []roachpb.Span, err error) {
var initialHighWater hlc.Timestamp
spans = make([]roachpb.Span, 0, len(ca.spec.Watches))
// Keep initialHighWater as the minimum of all InitialResolved timestamps.
// If there are any zero InitialResolved timestamps, initial scan is
// ongoing. If there are no zero InitialResolved timestamps, initial scan
// is not required.
for i, watch := range ca.spec.Watches {
spans = append(spans, watch.Span)
if i == 0 {
initialHighWater = watch.InitialResolved
continue
}
if watch.InitialResolved.Less(initialHighWater) {
initialHighWater = watch.InitialResolved
}
}
ca.frontier, err = resolvedspan.NewAggregatorFrontier(ca.spec.Feed.StatementTime, initialHighWater, spans...)
if err != nil {
return nil, err
}
checkpointedSpanTs := ca.spec.Checkpoint.Timestamp
// Checkpoint records from 21.2 were used only for backfills and did not store
// the timestamp, since in a backfill it must either be the StatementTime for
// an initial backfill, or right after the high-water for schema backfills.
if checkpointedSpanTs.IsEmpty() {
if initialHighWater.IsEmpty() {
checkpointedSpanTs = ca.spec.Feed.StatementTime
} else {
checkpointedSpanTs = initialHighWater.Next()
}
}
// Checkpointed spans are spans that were above the highwater mark, and we
// must preserve that information in the frontier for future checkpointing.
for _, checkpointedSpan := range ca.spec.Checkpoint.Spans {
if _, err := ca.frontier.Forward(checkpointedSpan, checkpointedSpanTs); err != nil {
return nil, err
}
}
return spans, nil
}
// close has two purposes: to synchronize on the completion of the helper
// goroutines created by the Start method, and to clean up any resources used by
// the processor.
//
// Due to the fact that this method may be called even if the processor did not
// finish completion, there is an excessive amount of nil checking. For example,
// (*changeAggregator) Start() may encounter an error and move the processor to
// draining before one of the fields below (ex. ca.drainDone) is set.
func (ca *changeAggregator) close() {
if ca.Closed {
return
}
if ca.cancel != nil {
// consumer close may be called even before Start is called. If that's
// the case, cancel is not initialized. We still need to perform the
// remainder of the cleanup though.
ca.cancel()
}
// Wait for the poller to finish shutting down if it was started.
if ca.kvFeedDoneCh != nil {
<-ca.kvFeedDoneCh
}
if ca.eventConsumer != nil {
_ = ca.eventConsumer.Close() // context cancellation expected here.
}
if ca.closeTelemetryRecorder != nil {
ca.closeTelemetryRecorder()
}
if ca.sink != nil {
// Best effort: context is often cancel by now, so we expect to see an error
_ = ca.sink.Close()
}
// The sliMetrics registry may hold on to some state for each aggregator
// (ex. last known resolved timestamp). De-register the aggregator so this
// data is deleted and not included in metrics.
if ca.sliMetrics != nil {
ca.sliMetrics.closeId(ca.sliMetricsID)
}
ca.memAcc.Close(ca.Ctx())
ca.MemMonitor.Stop(ca.Ctx())
ca.InternalClose()
}
var aggregatorHeartbeatFrequency = settings.RegisterDurationSetting(
settings.ApplicationLevel,
"changefeed.aggregator.heartbeat",
"changefeed aggregator will emit a heartbeat message to the coordinator with this frequency; 0 disables. "+
"The setting value should be <=1/2 of server.shutdown.jobs.timeout period",
4*time.Second,
settings.NonNegativeDuration,
)
var aggregatorFlushJitter = settings.RegisterFloatSetting(
settings.ApplicationLevel,
"changefeed.aggregator.flush_jitter",
"jitter aggregator flushes as a fraction of min_checkpoint_frequency. This "+
"setting has no effect if min_checkpoint_frequency is set to 0.",
0.1, /* 10% */
settings.NonNegativeFloat,
settings.WithPublic,
)
func nextFlushWithJitter(s timeutil.TimeSource, d time.Duration, j float64) (time.Time, error) {
if j < 0 || d < 0 {
return s.Now(), errors.AssertionFailedf("invalid jitter value: %f, duration: %s", j, d)
}
if j == 0 || d == 0 {
return s.Now().Add(d), nil
}
nextFlush := d + time.Duration(rand.Int63n(int64(j*float64(d))))
return s.Now().Add(nextFlush), nil
}
// Next is part of the RowSource interface.
func (ca *changeAggregator) Next() (rowenc.EncDatumRow, *execinfrapb.ProducerMetadata) {
shouldEmitHeartBeat := func() bool {
freq := aggregatorHeartbeatFrequency.Get(&ca.FlowCtx.Cfg.Settings.SV)
return freq > 0 && timeutil.Since(ca.lastPush) > freq
}
for ca.State == execinfra.StateRunning {
if !ca.changedRowBuf.IsEmpty() {
ca.lastPush = timeutil.Now()
return ca.ProcessRowHelper(ca.changedRowBuf.Pop()), nil
} else if !ca.resolvedSpanBuf.IsEmpty() {
ca.lastPush = timeutil.Now()
return ca.ProcessRowHelper(ca.resolvedSpanBuf.Pop()), nil
} else if shouldEmitHeartBeat() {
// heartbeat is simply an attempt to push a row into process row helper.
// This mechanism allows coordinator to propagate shutdown information to
// all aggregators -- that is, inability to write to this channel will
// trigger aggregator to transition away from StateRunning.
ca.lastPush = timeutil.Now()
return nil, &execinfrapb.ProducerMetadata{
Changefeed: &execinfrapb.ChangefeedMeta{Heartbeat: true},
}
}
// As the last gasp before shutdown, transmit an up-to-date frontier
// information to the coordinator. We expect to get this signal via the
// polling below before the drain actually occurs and starts tearing
// things down.
if err := ca.checkForNodeDrain(); err != nil {
// NB: we do not invoke ca.cancel here -- just merely moving
// to drain state so that the trailing metadata callback
// has a chance to produce shutdown checkpoint.
if log.V(2) {
log.Infof(ca.Ctx(), "change aggregator moving to draining due to error while checking for node drain: %v", err)
}
ca.MoveToDraining(err)
break
}
if err := ca.tick(); err != nil {
var e kvevent.ErrBufferClosed
if errors.As(err, &e) {
// ErrBufferClosed is a signal that our kvfeed has exited expectedly.
err = e.Unwrap()
if errors.Is(err, kvevent.ErrNormalRestartReason) {
err = nil
}
} else {
// If the poller errored first, that's the
// interesting one, so overwrite `err`.
if kvFeedErr := ca.checkKVFeedErr(); kvFeedErr != nil {
err = kvFeedErr
}
}
// Shut down the poller if it wasn't already.
ca.cancel()
if log.V(2) {
log.Infof(ca.Ctx(), "change aggregator moving to draining due to error from tick: %v", err)
}
ca.MoveToDraining(err)
break
}
}
return nil, ca.DrainHelper()
}
func (ca *changeAggregator) computeTrailingMetadata(meta *execinfrapb.ChangefeedMeta) {
if ca.eventConsumer == nil || ca.sink == nil {
// Shutdown may be called even if Start didn't succeed.
return
}
// Before emitting trailing metadata, we must flush any buffered events.
// Note: we are not flushing KV feed -- blocking buffer may still have buffered
// elements; but we are not interested in flushing potentially large number of events;
// all we want to ensure is that any previously observed event (such as resolved timestamp)
// has been fully processed.
if err := ca.flushBufferedEvents(); err != nil {
// This method may be invoked during shutdown when the context already canceled.
// Regardless for the cause of this error, there is nothing we can do with it anyway.
// All we want to ensure is that if any error occurs we still return correct checkpoint,
// which in this case is nothing.
return
}
// Build out the list of frontier spans.
ca.frontier.Entries(func(r roachpb.Span, ts hlc.Timestamp) (done span.OpResult) {
meta.Checkpoint = append(meta.Checkpoint,
execinfrapb.ChangefeedMeta_FrontierSpan{
Span: r,
Timestamp: ts,
})
return span.ContinueMatch
})
}
// tick is the workhorse behind Next(). It retrieves the next event from
// kvFeed, sends off this event to the event consumer, and flushes the sink
// if necessary.
func (ca *changeAggregator) tick() error {
event, err := ca.eventProducer.Get(ca.Ctx())
if err != nil {
return err
}
queuedNanos := timeutil.Since(event.BufferAddTimestamp()).Nanoseconds()
ca.metrics.QueueTimeNanos.Inc(queuedNanos)
switch event.Type() {
case kvevent.TypeKV:
// Keep track of SLI latency for non-backfill/rangefeed KV events.
if event.BackfillTimestamp().IsEmpty() {
ca.sliMetrics.AdmitLatency.RecordValue(timeutil.Since(event.Timestamp().GoTime()).Nanoseconds())
}
ca.recentKVCount++
return ca.eventConsumer.ConsumeEvent(ca.Ctx(), event)
case kvevent.TypeResolved:
a := event.DetachAlloc()
a.Release(ca.Ctx())
resolved := event.Resolved()
if ca.knobs.FilterSpanWithMutation != nil {
shouldFilter, err := ca.knobs.FilterSpanWithMutation(&resolved)
if err != nil {
return err
}
if shouldFilter {
return nil
}
}
return ca.noteResolvedSpan(resolved)
case kvevent.TypeFlush:
return ca.flushBufferedEvents()
}
return nil
}
func (ca *changeAggregator) flushBufferedEvents() error {
if err := ca.eventConsumer.Flush(ca.Ctx()); err != nil {
return err
}
return ca.sink.Flush(ca.Ctx())
}
// noteResolvedSpan periodically flushes Frontier progress from the current
// changeAggregator node to the changeFrontier node to allow the changeFrontier
// to persist the overall changefeed's progress
func (ca *changeAggregator) noteResolvedSpan(resolved jobspb.ResolvedSpan) (returnErr error) {
if resolved.Timestamp.IsEmpty() {
// @0.0 resolved timestamps could come in from rangefeed checkpoint.
// When rangefeed starts running, it emits @0.0 resolved timestamp.
// We don't care about those as far as checkpointing concerned.
return nil
}
advanced, err := ca.frontier.ForwardResolvedSpan(ca.Ctx(), resolved)
if err != nil {
return err
}
sv := &ca.FlowCtx.Cfg.Settings.SV
defer func(ctx context.Context) {
maybeLogBehindSpan(ctx, "aggregator", ca.frontier, advanced, sv)
}(ca.Ctx())
// The resolved sliMetric data backs the aggregator_progress metric
if advanced {
ca.sliMetrics.setResolved(ca.sliMetricsID, ca.frontier.Frontier())
}
forceFlush := resolved.BoundaryType != jobspb.ResolvedSpan_NONE
// NB: if we miss flush window, and the flush frequency is fairly high (minutes),
// it might be a while before frontier advances again (particularly if
// the number of ranges and closed timestamp settings are high).
// TODO(yevgeniy): Consider doing something similar to how job checkpointing
// works in the frontier where if we missed the window to checkpoint, we will attempt
// the checkpoint at the next opportune moment.
checkpointFrontier := advanced &&
(forceFlush || timeutil.Now().After(ca.nextHighWaterFlush))
if checkpointFrontier {
defer func() {
ca.nextHighWaterFlush, err = nextFlushWithJitter(
timeutil.DefaultTimeSource{}, ca.flushFrequency, aggregatorFlushJitter.Get(sv))
if err != nil {
returnErr = errors.CombineErrors(returnErr, err)
}
}()
return ca.flushFrontier()
}
// At a lower frequency, we checkpoint specific spans in the job progress
// either in backfills or if the highwater mark is excessively lagging behind.
checkpointSpans := ca.spec.JobID != 0 && /* enterprise changefeed */
(ca.frontier.InBackfill(resolved) || ca.frontier.HasLaggingSpans(sv)) &&
canCheckpointSpans(sv, ca.lastSpanFlush)
if checkpointSpans {
defer func() {
ca.lastSpanFlush = timeutil.Now()
}()
return ca.flushFrontier()
}
return returnErr
}
// flushFrontier flushes sink and emits resolved spans to the change frontier.
func (ca *changeAggregator) flushFrontier() error {
// Make sure to flush the sink before forwarding resolved spans,
// otherwise, we could lose buffered messages and violate the
// at-least-once guarantee. This is also true for checkpointing the
// resolved spans in the job progress.
if err := ca.flushBufferedEvents(); err != nil {
return err
}
// Iterate frontier spans and build a list of spans to emit.
var batch jobspb.ResolvedSpans
ca.frontier.EntriesWithBoundaryType(func(s roachpb.Span, ts hlc.Timestamp, boundaryType jobspb.ResolvedSpan_BoundaryType) (done span.OpResult) {
batch.ResolvedSpans = append(batch.ResolvedSpans, jobspb.ResolvedSpan{
Span: s,
Timestamp: ts,
BoundaryType: boundaryType,
})
return span.ContinueMatch
})
return ca.emitResolved(batch)
}
func (ca *changeAggregator) emitResolved(batch jobspb.ResolvedSpans) error {
progressUpdate := jobspb.ResolvedSpans{
ResolvedSpans: batch.ResolvedSpans,
Stats: jobspb.ResolvedSpans_Stats{
RecentKvCount: ca.recentKVCount,
},
}
updateBytes, err := protoutil.Marshal(&progressUpdate)
if err != nil {
return err
}
ca.resolvedSpanBuf.Push(rowenc.EncDatumRow{
rowenc.EncDatum{Datum: tree.NewDBytes(tree.DBytes(updateBytes))},
rowenc.EncDatum{Datum: tree.DNull}, // topic
rowenc.EncDatum{Datum: tree.DNull}, // key
rowenc.EncDatum{Datum: tree.DNull}, // value
})
ca.metrics.ResolvedMessages.Inc(1)
ca.recentKVCount = 0
return nil
}
// ConsumerClosed is part of the RowSource interface.
func (ca *changeAggregator) ConsumerClosed() {
// The consumer is done, Next() will not be called again.
ca.close()
}
func (ca *changeAggregator) isSinkless() bool {
return ca.spec.JobID == 0
}
const (
emitAllResolved = 0
emitNoResolved = -1
)
type changeFrontier struct {
execinfra.ProcessorBase
evalCtx *eval.Context
spec execinfrapb.ChangeFrontierSpec
memAcc mon.BoundAccount
a tree.DatumAlloc
// input returns rows from one or more changeAggregator processors
input execinfra.RowSource
// frontier contains the current resolved timestamp high-water for the tracked
// span set.
frontier *resolvedspan.CoordinatorFrontier
// localState contains an in memory cache of progress updates.
// Used by core style changefeeds as well as regular changefeeds to make
// restarts more efficient with respects to duplicates.
// localState reflects an up-to-date information *after* the checkpoint.
localState *cachedState
// encoder is the Encoder to use for resolved timestamp serialization.
encoder Encoder
// sink is the Sink to write resolved timestamps to. Rows are never written
// by changeFrontier.
sink ResolvedTimestampSink
// freqEmitResolved, if >= 0, is a lower bound on the duration between
// resolved timestamp emits.
freqEmitResolved time.Duration
// lastEmitResolved is the last time a resolved timestamp was emitted.
lastEmitResolved time.Time
// latestResolvedKV indicates the last time the frontier received a resolved
// span from an aggregator that has recently emitted kv events.
latestResolvedKV time.Time
// lastProtectedTimestampUpdate is the last time the protected timestamp
// record was updated to the frontier's highwater mark
lastProtectedTimestampUpdate time.Time