diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt
index 9f11b0215de1..8de6a8f52915 100644
--- a/docs/generated/settings/settings-for-tenants.txt
+++ b/docs/generated/settings/settings-for-tenants.txt
@@ -10,6 +10,10 @@ bulkio.backup.file_size byte size 128 MiB target size for individual data files
bulkio.backup.read_timeout duration 5m0s amount of time after which a read attempt is considered timed out, which causes the backup to fail
bulkio.backup.read_with_priority_after duration 1m0s amount of time since the read-as-of time above which a BACKUP should use priority when retrying reads
bulkio.stream_ingestion.minimum_flush_interval duration 5s the minimum timestamp between flushes; flushes may still occur if internal buffers fill up
+changefeed.balance_range_distribution.enable boolean false if enabled, the ranges are balanced equally among all nodes
+changefeed.event_consumer_worker_queue_size integer 16 if changefeed.event_consumer_workers is enabled, this setting sets the maximum number of events which a worker can buffer
+changefeed.event_consumer_workers integer 0 the number of workers to use when processing events: <0 disables, 0 assigns a reasonable default, >0 assigns the setting value
+changefeed.fast_gzip.enabled boolean true use fast gzip implementation
changefeed.node_throttle_config string specifies node level throttling configuration for all changefeeeds
changefeed.schema_feed.read_with_priority_after duration 1m0s retry with high priority if we were not able to read descriptors for too long; 0 disables
cloudstorage.azure.concurrent_upload_buffers integer 1 controls the number of concurrent buffers that will be used by the Azure client when uploading chunks.Each buffer can buffer up to cloudstorage.write_chunk.size of memory during an upload
diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
index 0ddb27c443f9..2d21f8281bfe 100644
--- a/docs/generated/settings/settings.html
+++ b/docs/generated/settings/settings.html
@@ -16,6 +16,10 @@
bulkio.backup.read_timeout | duration | 5m0s | amount of time after which a read attempt is considered timed out, which causes the backup to fail |
bulkio.backup.read_with_priority_after | duration | 1m0s | amount of time since the read-as-of time above which a BACKUP should use priority when retrying reads |
bulkio.stream_ingestion.minimum_flush_interval | duration | 5s | the minimum timestamp between flushes; flushes may still occur if internal buffers fill up |
+changefeed.balance_range_distribution.enable | boolean | false | if enabled, the ranges are balanced equally among all nodes |
+changefeed.event_consumer_worker_queue_size | integer | 16 | if changefeed.event_consumer_workers is enabled, this setting sets the maximum number of events which a worker can buffer |
+changefeed.event_consumer_workers | integer | 0 | the number of workers to use when processing events: <0 disables, 0 assigns a reasonable default, >0 assigns the setting value |
+changefeed.fast_gzip.enabled | boolean | true | use fast gzip implementation |
changefeed.node_throttle_config | string |
| specifies node level throttling configuration for all changefeeeds |
changefeed.schema_feed.read_with_priority_after | duration | 1m0s | retry with high priority if we were not able to read descriptors for too long; 0 disables |
cloudstorage.azure.concurrent_upload_buffers | integer | 1 | controls the number of concurrent buffers that will be used by the Azure client when uploading chunks.Each buffer can buffer up to cloudstorage.write_chunk.size of memory during an upload |
diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel
index 3b46a7d268b2..d4944daca34c 100644
--- a/pkg/BUILD.bazel
+++ b/pkg/BUILD.bazel
@@ -588,6 +588,7 @@ ALL_TESTS = [
"//pkg/util/span:span_test",
"//pkg/util/stop:stop_test",
"//pkg/util/stringarena:stringarena_test",
+ "//pkg/util/strutil:strutil_test",
"//pkg/util/syncutil/singleflight:singleflight_test",
"//pkg/util/syncutil:syncutil_test",
"//pkg/util/sysutil:sysutil_test",
@@ -2016,6 +2017,8 @@ GO_TARGETS = [
"//pkg/util/stringarena:stringarena",
"//pkg/util/stringarena:stringarena_test",
"//pkg/util/stringencoding:stringencoding",
+ "//pkg/util/strutil:strutil",
+ "//pkg/util/strutil:strutil_test",
"//pkg/util/syncutil/singleflight:singleflight",
"//pkg/util/syncutil/singleflight:singleflight_test",
"//pkg/util/syncutil:syncutil",
@@ -2969,6 +2972,7 @@ GET_X_DATA_TARGETS = [
"//pkg/util/stop:get_x_data",
"//pkg/util/stringarena:get_x_data",
"//pkg/util/stringencoding:get_x_data",
+ "//pkg/util/strutil:get_x_data",
"//pkg/util/syncutil:get_x_data",
"//pkg/util/syncutil/singleflight:get_x_data",
"//pkg/util/system:get_x_data",
diff --git a/pkg/ccl/changefeedccl/BUILD.bazel b/pkg/ccl/changefeedccl/BUILD.bazel
index 66e3f4a84ae5..86e691fdce26 100644
--- a/pkg/ccl/changefeedccl/BUILD.bazel
+++ b/pkg/ccl/changefeedccl/BUILD.bazel
@@ -10,6 +10,7 @@ go_library(
"changefeed_dist.go",
"changefeed_processors.go",
"changefeed_stmt.go",
+ "compression.go",
"doc.go",
"encoder.go",
"encoder_avro.go",
@@ -58,6 +59,7 @@ go_library(
"//pkg/jobs/jobsprotectedts",
"//pkg/keys",
"//pkg/kv",
+ "//pkg/kv/kvclient/kvcoord",
"//pkg/kv/kvserver",
"//pkg/kv/kvserver/closedts",
"//pkg/kv/kvserver/protectedts",
@@ -96,6 +98,7 @@ go_library(
"//pkg/sql/sessiondatapb",
"//pkg/sql/sqlutil",
"//pkg/sql/types",
+ "//pkg/util",
"//pkg/util/bitarray",
"//pkg/util/bufalloc",
"//pkg/util/cache",
@@ -227,6 +230,7 @@ go_test(
"//pkg/sql/parser",
"//pkg/sql/randgen",
"//pkg/sql/rowenc",
+ "//pkg/sql/rowenc/keyside",
"//pkg/sql/sem/eval",
"//pkg/sql/sem/tree",
"//pkg/sql/sem/volatility",
diff --git a/pkg/ccl/changefeedccl/avro.go b/pkg/ccl/changefeedccl/avro.go
index 23059ff791e5..e6a81150daeb 100644
--- a/pkg/ccl/changefeedccl/avro.go
+++ b/pkg/ccl/changefeedccl/avro.go
@@ -607,6 +607,7 @@ func typeToAvroSchema(typ *types.T) (*avroSchemaField, error) {
},
func(d tree.Datum, memo interface{}) (interface{}, error) {
datumArr := d.(*tree.DArray)
+
var avroArr []interface{}
if memo != nil {
avroArr = memo.([]interface{})
@@ -616,14 +617,13 @@ func typeToAvroSchema(typ *types.T) (*avroSchemaField, error) {
} else {
avroArr = make([]interface{}, 0, datumArr.Len())
}
-
for i, elt := range datumArr.Array {
var encoded interface{}
if elt == tree.DNull {
encoded = nil
} else {
var encErr error
- if i < len(avroArr) {
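+ // The memoized entry can be nil if the datum at this position was NULL the
+ // last time this array was encoded; in that case, encode without the memo to
+ // avoid a nil type assertion.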
+ if i < len(avroArr) && avroArr[i] != nil {
encoded, encErr = itemSchema.encodeDatum(elt, avroArr[i].(map[string]interface{})[itemUnionKey])
} else {
encoded, encErr = itemSchema.encodeDatum(elt, nil)
diff --git a/pkg/ccl/changefeedccl/bench_test.go b/pkg/ccl/changefeedccl/bench_test.go
index 08a7f06d1d33..1d4d3916b025 100644
--- a/pkg/ccl/changefeedccl/bench_test.go
+++ b/pkg/ccl/changefeedccl/bench_test.go
@@ -210,7 +210,7 @@ func createBenchmarkChangefeed(
}
initialHighWater := hlc.Timestamp{}
encodingOpts := changefeedbase.EncodingOptions{Format: changefeedbase.OptFormatJSON, Envelope: changefeedbase.OptEnvelopeRow}
- encoder, err := makeJSONEncoder(encodingOpts, AllTargets(details))
+ encoder, err := makeJSONEncoder(encodingOpts)
if err != nil {
return nil, nil, err
}
diff --git a/pkg/ccl/changefeedccl/cdcevent/BUILD.bazel b/pkg/ccl/changefeedccl/cdcevent/BUILD.bazel
index c851d9b62c88..ca9c8ea23c10 100644
--- a/pkg/ccl/changefeedccl/cdcevent/BUILD.bazel
+++ b/pkg/ccl/changefeedccl/cdcevent/BUILD.bazel
@@ -8,6 +8,7 @@ go_library(
"event.go",
"projection.go",
"rowfetcher_cache.go",
+ "version_cache.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdcevent",
visibility = ["//visibility:public"],
diff --git a/pkg/ccl/changefeedccl/cdcevent/event.go b/pkg/ccl/changefeedccl/cdcevent/event.go
index 41123760f6a7..9534584a8ea7 100644
--- a/pkg/ccl/changefeedccl/cdcevent/event.go
+++ b/pkg/ccl/changefeedccl/cdcevent/event.go
@@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/row"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
+ "github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/cache"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/iterutil"
@@ -345,7 +346,7 @@ func getEventDescriptorCached(
schemaTS hlc.Timestamp,
cache *cache.UnorderedCache,
) (*EventDescriptor, error) {
- idVer := idVersion{id: desc.GetID(), version: desc.GetVersion(), family: family.ID}
+ idVer := CacheKey{ID: desc.GetID(), Version: desc.GetVersion(), FamilyID: family.ID}
if v, ok := cache.Get(idVer); ok {
ed := v.(*EventDescriptor)
@@ -381,7 +382,7 @@ func NewEventDecoder(
return nil, err
}
- eventDescriptorCache := cache.NewUnorderedCache(defaultCacheConfig)
+ eventDescriptorCache := cache.NewUnorderedCache(DefaultCacheConfig)
getEventDescriptor := func(
desc catalog.TableDescriptor,
family *descpb.ColumnFamilyDescriptor,
@@ -549,3 +550,57 @@ func TestingGetFamilyIDFromKey(
_, familyID, err := decoder.(*eventDecoder).rfCache.tableDescForKey(context.Background(), key, ts)
return familyID, err
}
+
+// TestingMakeEventRowFromEncDatums creates an event row from the specified enc datum row.
+// encRow is assumed to contain *already decoded* datums.
+// The first numKeyCols are assumed to be primary key columns.
+// Column names are generated based on the datum types.
+func TestingMakeEventRowFromEncDatums(
+ encRow rowenc.EncDatumRow, colTypes []*types.T, numKeyCols int, deleted bool,
+) Row {
+ if len(encRow) != len(colTypes) {
+ panic("unexpected length mismatch")
+ }
+ intRange := func(start, end int) (res []int) {
+ for i := 0; i < end; i++ {
+ res = append(res, i)
+ }
+ return res
+ }
+ ed := &EventDescriptor{
+ Metadata: Metadata{
+ TableID: 42,
+ TableName: "randtbl",
+ FamilyName: "primary",
+ },
+ cols: func() (cols []ResultColumn) {
+ names := make(map[string]int, len(encRow))
+ for i, typ := range colTypes {
+ colName := fmt.Sprintf("col_%s", typ.String())
+ names[colName]++
+ if names[colName] > 1 {
+ colName += fmt.Sprintf("_%d", names[colName]-1)
+ }
+ cols = append(cols, ResultColumn{
+ ResultColumn: colinfo.ResultColumn{
+ Name: colName,
+ Typ: typ,
+ TableID: 42,
+ },
+ ord: i,
+ })
+ }
+ return cols
+ }(),
+ keyCols: intRange(0, numKeyCols),
+ valueCols: intRange(0, len(encRow)),
+ }
+
+ var alloc tree.DatumAlloc
+ return Row{
+ EventDescriptor: ed,
+ datums: encRow,
+ deleted: deleted,
+ alloc: &alloc,
+ }
+}
diff --git a/pkg/ccl/changefeedccl/cdcevent/rowfetcher_cache.go b/pkg/ccl/changefeedccl/cdcevent/rowfetcher_cache.go
index 3f5523eeadb8..7d29b5af9c91 100644
--- a/pkg/ccl/changefeedccl/cdcevent/rowfetcher_cache.go
+++ b/pkg/ccl/changefeedccl/cdcevent/rowfetcher_cache.go
@@ -57,20 +57,6 @@ type watchedFamily struct {
familyName string
}
-var defaultCacheConfig = cache.Config{
- Policy: cache.CacheFIFO,
- // TODO: If we find ourselves thrashing here in changefeeds on many tables,
- // we can improve performance by eagerly evicting versions using Resolved notifications.
- // A old version with a timestamp entirely before a notification can be safely evicted.
- ShouldEvict: func(size int, _ interface{}, _ interface{}) bool { return size > 1024 },
-}
-
-type idVersion struct {
- id descpb.ID
- version descpb.DescriptorVersion
- family descpb.FamilyID
-}
-
// newRowFetcherCache constructs row fetcher cache.
func newRowFetcherCache(
ctx context.Context,
@@ -96,7 +82,7 @@ func newRowFetcherCache(
leaseMgr: leaseMgr,
collection: cf.NewCollection(ctx, nil /* TemporarySchemaProvider */, nil /* monitor */),
db: db,
- fetchers: cache.NewUnorderedCache(defaultCacheConfig),
+ fetchers: cache.NewUnorderedCache(DefaultCacheConfig),
watchedFamilies: watchedFamilies,
}, err
}
@@ -192,7 +178,7 @@ var ErrUnwatchedFamily = errors.New("watched table but unwatched family")
func (c *rowFetcherCache) RowFetcherForColumnFamily(
tableDesc catalog.TableDescriptor, family descpb.FamilyID,
) (*row.Fetcher, *descpb.ColumnFamilyDescriptor, error) {
- idVer := idVersion{id: tableDesc.GetID(), version: tableDesc.GetVersion(), family: family}
+ idVer := CacheKey{ID: tableDesc.GetID(), Version: tableDesc.GetVersion(), FamilyID: family}
if v, ok := c.fetchers.Get(idVer); ok {
f := v.(*cachedFetcher)
if f.skip {
diff --git a/pkg/ccl/changefeedccl/cdcevent/version_cache.go b/pkg/ccl/changefeedccl/cdcevent/version_cache.go
new file mode 100644
index 000000000000..fabcb5ee68b3
--- /dev/null
+++ b/pkg/ccl/changefeedccl/cdcevent/version_cache.go
@@ -0,0 +1,42 @@
+// Copyright 2022 The Cockroach Authors.
+//
+// Licensed as a CockroachDB Enterprise file under the Cockroach Community
+// License (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
+
+package cdcevent
+
+import (
+ "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
+ "github.com/cockroachdb/cockroach/pkg/util/cache"
+)
+
+// DefaultCacheConfig is the default configuration for an unordered cache.
+var DefaultCacheConfig = cache.Config{
+ Policy: cache.CacheFIFO,
+ // TODO: If we find ourselves thrashing here in changefeeds on many tables,
+ // we can improve performance by eagerly evicting versions using Resolved notifications.
+// An old version with a timestamp entirely before a notification can be safely evicted.
+ ShouldEvict: func(size int, _ interface{}, _ interface{}) bool { return size > 1024 },
+}
+
+// CacheKey is the key for the event caches.
+type CacheKey struct {
+ ID descpb.ID
+ Version descpb.DescriptorVersion
+ FamilyID descpb.FamilyID
+}
+
+// GetCachedOrCreate returns the cached object for the given key, or creates and caches a new one.
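+//
+// A minimal usage sketch (the cached type is illustrative):
+//
+//	v := GetCachedOrCreate(key, c, func() interface{} { return &versionEncoder{} })
+//	enc := v.(*versionEncoder)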
+func GetCachedOrCreate(
+ k CacheKey, c *cache.UnorderedCache, creator func() interface{},
+) interface{} {
+ if v, ok := c.Get(k); ok {
+ return v
+ }
+ v := creator()
+ c.Add(k, v)
+ return v
+}
diff --git a/pkg/ccl/changefeedccl/changefeed_dist.go b/pkg/ccl/changefeedccl/changefeed_dist.go
index 91210a4e10a9..3952e4729121 100644
--- a/pkg/ccl/changefeedccl/changefeed_dist.go
+++ b/pkg/ccl/changefeedccl/changefeed_dist.go
@@ -10,12 +10,15 @@ package changefeedccl
import (
"context"
+ "sort"
"time"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdceval"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
+ "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvfeed"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv"
+ "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/sql"
@@ -29,6 +32,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/sem/catid"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
+ "github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/errors"
@@ -307,6 +311,14 @@ func startDistChangefeed(
return err
}
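+// enableBalancedRangeDistribution, when enabled, makes initial-scan-only
+// changefeeds rebalance ranges evenly across nodes (see rebalanceSpanPartitions).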
+var enableBalancedRangeDistribution = settings.RegisterBoolSetting(
+ settings.TenantWritable,
+ "changefeed.balance_range_distribution.enable",
+ "if enabled, the ranges are balanced equally among all nodes",
+ util.ConstantWithMetamorphicTestBool(
+ "changefeed.balance_range_distribution.enable", false),
+).WithPublic()
+
func makePlan(
execCtx sql.JobExecContext,
jobID jobspb.JobID,
@@ -316,24 +328,41 @@ func makePlan(
trackedSpans []roachpb.Span,
selectClause string,
) func(context.Context, *sql.DistSQLPlanner) (*sql.PhysicalPlan, *sql.PlanningCtx, error) {
-
return func(ctx context.Context, dsp *sql.DistSQLPlanner) (*sql.PhysicalPlan, *sql.PlanningCtx, error) {
var blankTxn *kv.Txn
+ distMode := sql.DistributionTypeAlways
+ if details.SinkURI == `` {
+ // Sinkless feeds get one ChangeAggregator on this node.
+ distMode = sql.DistributionTypeNone
+ }
+
planCtx := dsp.NewPlanningCtx(ctx, execCtx.ExtendedEvalContext(), nil /* planner */, blankTxn,
- sql.DistributionTypeAlways)
+ sql.DistributionType(distMode))
+ spanPartitions, err := dsp.PartitionSpans(ctx, planCtx, trackedSpans)
+ if err != nil {
+ return nil, nil, err
+ }
- var spanPartitions []sql.SpanPartition
- if details.SinkURI == `` {
- // Sinkless feeds get one ChangeAggregator on the gateway.
- spanPartitions = []sql.SpanPartition{{SQLInstanceID: dsp.GatewayID(), Spans: trackedSpans}}
- } else {
- // All other feeds get a ChangeAggregator local on the leaseholder.
- var err error
- spanPartitions, err = dsp.PartitionSpans(ctx, planCtx, trackedSpans)
+ sv := &execCtx.ExecCfg().Settings.SV
+ if enableBalancedRangeDistribution.Get(sv) {
+ scanType, err := changefeedbase.MakeStatementOptions(details.Opts).GetInitialScanType()
if err != nil {
return nil, nil, err
}
+
+ // Currently, balanced range distribution is supported only in export mode.
+ // TODO(yevgeniy): Consider lifting this restriction.
+ if scanType == changefeedbase.OnlyInitialScan {
+ sender := execCtx.ExecCfg().DB.NonTransactionalSender()
+ distSender := sender.(*kv.CrossRangeTxnWrapperSender).Wrapped().(*kvcoord.DistSender)
+
+ spanPartitions, err = rebalanceSpanPartitions(
+ ctx, &distResolver{distSender}, rebalanceThreshold.Get(sv), spanPartitions)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
}
// Use the same checkpoint for all aggregators; each aggregator will only look at
@@ -442,3 +471,88 @@ func (w *changefeedResultWriter) SetError(err error) {
func (w *changefeedResultWriter) Err() error {
return w.err
}
+
+var rebalanceThreshold = settings.RegisterFloatSetting(
+ settings.TenantWritable,
+ "changefeed.balance_range_distribution.sensitivity",
+ "rebalance if the number of ranges on a node exceeds the average by this fraction",
+ 0.05,
+ settings.PositiveFloat,
+)
+
+type rangeResolver interface {
+ getRangesForSpans(ctx context.Context, spans []roachpb.Span) ([]roachpb.Span, error)
+}
+
+type distResolver struct {
+ *kvcoord.DistSender
+}
+
+func (r *distResolver) getRangesForSpans(
+ ctx context.Context, spans []roachpb.Span,
+) ([]roachpb.Span, error) {
+ return kvfeed.AllRangeSpans(ctx, r.DistSender, spans)
+}
+
+func rebalanceSpanPartitions(
+ ctx context.Context, r rangeResolver, sensitivity float64, p []sql.SpanPartition,
+) ([]sql.SpanPartition, error) {
+ if len(p) <= 1 {
+ return p, nil
+ }
+
+ // Explode set of spans into set of ranges.
+ // TODO(yevgeniy): This might not be great if the tables are huge.
+ numRanges := 0
+ for i := range p {
+ spans, err := r.getRangesForSpans(ctx, p[i].Spans)
+ if err != nil {
+ return nil, err
+ }
+ p[i].Spans = spans
+ numRanges += len(spans)
+ }
+
+ // Sort descending based on the number of ranges.
+ sort.Slice(p, func(i, j int) bool {
+ return len(p[i].Spans) > len(p[j].Spans)
+ })
+
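+ // E.g. with 6 ranges spread over 3 partitions and sensitivity 0.05,
+ // targetRanges is int(1.05 * 6 / 3) = 2.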
+ targetRanges := int((1 + sensitivity) * float64(numRanges) / float64(len(p)))
+
+ for i, j := 0, len(p)-1; i < j && len(p[i].Spans) > targetRanges && len(p[j].Spans) < targetRanges; {
+ from, to := i, j
+
+ // Figure out how many ranges we can move.
+ numToMove := len(p[from].Spans) - targetRanges
+ canMove := targetRanges - len(p[to].Spans)
+ if numToMove <= canMove {
+ i++
+ }
+ if canMove <= numToMove {
+ numToMove = canMove
+ j--
+ }
+ if numToMove == 0 {
+ break
+ }
+
+ // Move numToMove spans from 'from' to 'to'.
+ idx := len(p[from].Spans) - numToMove
+ p[to].Spans = append(p[to].Spans, p[from].Spans[idx:]...)
+ p[from].Spans = p[from].Spans[:idx]
+ }
+
+ // Collapse ranges into nice set of contiguous spans.
+ for i := range p {
+ var g roachpb.SpanGroup
+ g.Add(p[i].Spans...)
+ p[i].Spans = g.Slice()
+ }
+
+ // Finally, re-sort based on the node id.
+ sort.Slice(p, func(i, j int) bool {
+ return p[i].SQLInstanceID < p[j].SQLInstanceID
+ })
+ return p, nil
+}
diff --git a/pkg/ccl/changefeedccl/changefeed_processors.go b/pkg/ccl/changefeedccl/changefeed_processors.go
index c25663ffcaeb..d447a7a6d523 100644
--- a/pkg/ccl/changefeedccl/changefeed_processors.go
+++ b/pkg/ccl/changefeedccl/changefeed_processors.go
@@ -60,8 +60,6 @@ type changeAggregator struct {
kvFeedDoneCh chan struct{}
kvFeedMemMon *mon.BytesMonitor
- // encoder is the Encoder to use for key and value serialization.
- encoder Encoder
// sink is the Sink to write rows to. Resolved timestamps are never written
// by changeAggregator.
sink EventSink
@@ -80,7 +78,7 @@ type changeAggregator struct {
// eventProducer produces the next event from the kv feed.
eventProducer kvevent.Reader
// eventConsumer consumes the event.
- eventConsumer *kvEventToRowConsumer
+ eventConsumer eventConsumer
// lastFlush and flushFrequency keep track of the flush frequency.
lastFlush time.Time
@@ -93,7 +91,6 @@ type changeAggregator struct {
metrics *Metrics
sliMetrics *sliMetrics
knobs TestingKnobs
- topicNamer *TopicNamer
}
type timestampLowerBoundOracle interface {
@@ -162,22 +159,6 @@ func newChangeAggregatorProcessor(
opts := changefeedbase.MakeStatementOptions(ca.spec.Feed.Opts)
- var err error
- encodingOpts, err := opts.GetEncodingOptions()
- if err != nil {
- return nil, err
- }
- if ca.encoder, err = getEncoder(encodingOpts, AllTargets(ca.spec.Feed)); err != nil {
- return nil, err
- }
-
- if encodingOpts.TopicInValue {
- ca.topicNamer, err = MakeTopicNamer(AllTargets(ca.spec.Feed))
- if err != nil {
- return nil, err
- }
- }
-
// MinCheckpointFrequency controls how frequently the changeAggregator flushes the sink
// and checkpoints the local frontier to changeFrontier. It is used as a rough
// approximation of how latency-sensitive the changefeed user is. For a high latency
@@ -225,7 +206,6 @@ func (ca *changeAggregator) Start(ctx context.Context) {
feed := makeChangefeedConfigFromJobDetails(ca.spec.Feed)
- endTime := feed.EndTime
opts := feed.Opts
if err != nil {
@@ -295,7 +275,7 @@ func (ca *changeAggregator) Start(ctx context.Context) {
kvFeedHighWater = ca.spec.Feed.StatementTime
}
- ca.eventProducer, err = ca.startKVFeed(ctx, spans, kvFeedHighWater, needsInitialScan, endTime)
+ ca.eventProducer, err = ca.startKVFeed(ctx, spans, kvFeedHighWater, needsInitialScan, feed)
if err != nil {
// Early abort in the case that there is an error creating the sink.
ca.MoveToDraining(err)
@@ -303,9 +283,9 @@ func (ca *changeAggregator) Start(ctx context.Context) {
return
}
- ca.eventConsumer, err = newKVEventToRowConsumer(
- ctx, ca.flowCtx.Cfg, ca.flowCtx.EvalCtx, ca.frontier.SpanFrontier(), kvFeedHighWater,
- ca.sink, ca.encoder, feed, ca.spec.Select, ca.knobs, ca.topicNamer)
+ ca.eventConsumer, ca.sink, err = newEventConsumer(
+ ctx, ca.flowCtx, feed, ca.frontier.SpanFrontier(), kvFeedHighWater,
+ ca.sink, feed, ca.spec.Select, ca.knobs, ca.metrics, ca.isSinkless())
if err != nil {
// Early abort in the case that there is an error setting up the consumption.
@@ -320,7 +300,7 @@ func (ca *changeAggregator) startKVFeed(
spans []roachpb.Span,
initialHighWater hlc.Timestamp,
needsInitialScan bool,
- endTime hlc.Timestamp,
+ config ChangefeedConfig,
) (kvevent.Reader, error) {
cfg := ca.flowCtx.Cfg
buf := kvevent.NewThrottlingBuffer(
@@ -329,7 +309,7 @@ func (ca *changeAggregator) startKVFeed(
// KVFeed takes ownership of the kvevent.Writer portion of the buffer, while
// we return the kvevent.Reader part to the caller.
- kvfeedCfg, err := ca.makeKVFeedCfg(ctx, spans, buf, initialHighWater, needsInitialScan, endTime)
+ kvfeedCfg, err := ca.makeKVFeedCfg(ctx, config, spans, buf, initialHighWater, needsInitialScan)
if err != nil {
return nil, err
}
@@ -359,28 +339,27 @@ func (ca *changeAggregator) startKVFeed(
func (ca *changeAggregator) makeKVFeedCfg(
ctx context.Context,
+ config ChangefeedConfig,
spans []roachpb.Span,
buf kvevent.Writer,
initialHighWater hlc.Timestamp,
needsInitialScan bool,
- endTime hlc.Timestamp,
) (kvfeed.Config, error) {
- opts := changefeedbase.MakeStatementOptions(ca.spec.Feed.Opts)
- schemaChange, err := opts.GetSchemaChangeHandlingOptions()
+ schemaChange, err := config.Opts.GetSchemaChangeHandlingOptions()
if err != nil {
return kvfeed.Config{}, err
}
- filters := opts.GetFilters()
+ filters := config.Opts.GetFilters()
cfg := ca.flowCtx.Cfg
- initialScanOnly := endTime.EqOrdering(initialHighWater)
+ initialScanOnly := config.EndTime.EqOrdering(initialHighWater)
var sf schemafeed.SchemaFeed
if schemaChange.Policy == changefeedbase.OptSchemaChangePolicyIgnore || initialScanOnly {
sf = schemafeed.DoNothingSchemaFeed
} else {
sf = schemafeed.New(ctx, cfg, schemaChange.EventClass, AllTargets(ca.spec.Feed),
- initialHighWater, &ca.metrics.SchemaFeedMetrics, opts.GetCanHandle())
+ initialHighWater, &ca.metrics.SchemaFeedMetrics, config.Opts.GetCanHandle())
}
return kvfeed.Config{
@@ -399,7 +378,7 @@ func (ca *changeAggregator) makeKVFeedCfg(
OnBackfillRangeCallback: ca.sliMetrics.getBackfillRangeCallback(),
MM: ca.kvFeedMemMon,
InitialHighWater: initialHighWater,
- EndTime: endTime,
+ EndTime: config.EndTime,
WithDiff: filters.WithDiff,
NeedsInitialScan: needsInitialScan,
SchemaChangeEvents: schemaChange.EventClass,
@@ -467,12 +446,17 @@ func (ca *changeAggregator) close() {
if ca.kvFeedDoneCh != nil {
<-ca.kvFeedDoneCh
}
+ if ca.eventConsumer != nil {
+ if err := ca.eventConsumer.Close(); err != nil {
+ log.Warningf(ca.Ctx, "error closing event consumer: %s", err)
+ }
+ }
+
if ca.sink != nil {
if err := ca.sink.Close(); err != nil {
log.Warningf(ca.Ctx, `error closing sink. goroutines may have leaked: %v`, err)
}
}
-
ca.memAcc.Close(ca.Ctx)
if ca.kvFeedMemMon != nil {
ca.kvFeedMemMon.Stop(ca.Ctx)
@@ -584,7 +568,7 @@ func (ca *changeAggregator) noteResolvedSpan(resolved jobspb.ResolvedSpan) error
// flushFrontier flushes sink and emits resolved timestamp if needed.
func (ca *changeAggregator) flushFrontier() error {
- // Make sure to flush the sink before forwarding resolved spans,
+ // Make sure to flush the sink before forwarding resolved spans,
// otherwise, we could lose buffered messages and violate the
// at-least-once guarantee. This is also true for checkpointing the
// resolved spans in the job progress.
@@ -639,6 +623,10 @@ func (ca *changeAggregator) ConsumerClosed() {
ca.close()
}
+func (ca *changeAggregator) isSinkless() bool {
+ return ca.spec.JobID == 0
+}
+
const (
emitAllResolved = 0
emitNoResolved = -1
diff --git a/pkg/ccl/changefeedccl/changefeed_stmt.go b/pkg/ccl/changefeedccl/changefeed_stmt.go
index 8f344fef50f4..810fa3a420a7 100644
--- a/pkg/ccl/changefeedccl/changefeed_stmt.go
+++ b/pkg/ccl/changefeedccl/changefeed_stmt.go
@@ -525,7 +525,8 @@ func createChangefeedJobRecord(
// the default error to avoid claiming the user set an option they didn't
// explicitly set. Fortunately we know the only way to cause this is to
// set envelope.
- if isCloudStorageSink(parsedSink) || isWebhookSink(parsedSink) {
+ if (isCloudStorageSink(parsedSink) || isWebhookSink(parsedSink)) &&
+ encodingOpts.Envelope != changefeedbase.OptEnvelopeBare {
if err = opts.ForceKeyInValue(); err != nil {
return nil, errors.Errorf(`this sink is incompatible with envelope=%s`, encodingOpts.Envelope)
}
diff --git a/pkg/ccl/changefeedccl/changefeed_test.go b/pkg/ccl/changefeedccl/changefeed_test.go
index a81d9ff00f15..1dc1252d04f4 100644
--- a/pkg/ccl/changefeedccl/changefeed_test.go
+++ b/pkg/ccl/changefeedccl/changefeed_test.go
@@ -1048,7 +1048,7 @@ func TestChangefeedProjectionDelete(t *testing.T) {
`foo: [0]->{}`,
})
}
- cdcTest(t, testFn)
+ cdcTest(t, testFn, feedTestForceSink("cloudstorage"))
}
// If we drop columns which are not targeted by the changefeed, it should not backfill.
@@ -2476,9 +2476,7 @@ func TestChangefeedBareJSON(t *testing.T) {
sqlDB.Exec(t, `INSERT INTO foo values (0, 'dog')`)
foo := feed(t, f, `CREATE CHANGEFEED WITH schema_change_policy=stop AS SELECT * FROM foo`)
defer closeFeed(t, foo)
- assertPayloads(t, foo, []string{
- `foo: [0]->{"a": 0, "b": "dog"}`,
- })
+ assertPayloads(t, foo, []string{`foo: [0]->{"a": 0, "b": "dog"}`})
}
cdcTest(t, testFn, feedTestForceSink("kafka"))
cdcTest(t, testFn, feedTestForceSink("enterprise"))
@@ -7427,3 +7425,84 @@ func TestChangefeedKafkaMessageTooLarge(t *testing.T) {
cdcTest(t, testFn, feedTestForceSink(`kafka`))
}
+
+type echoResolver struct {
+ result []roachpb.Spans
+ pos int
+}
+
+func (r *echoResolver) getRangesForSpans(
+ _ context.Context, _ []roachpb.Span,
+) (spans []roachpb.Span, _ error) {
+ spans = r.result[r.pos]
+ r.pos++
+ return spans, nil
+}
+
+func TestPartitionSpans(t *testing.T) {
+ defer leaktest.AfterTest(t)()
+
+ partitions := func(p ...sql.SpanPartition) []sql.SpanPartition {
+ return p
+ }
+ mkPart := func(n base.SQLInstanceID, spans ...roachpb.Span) sql.SpanPartition {
+ return sql.SpanPartition{SQLInstanceID: n, Spans: spans}
+ }
+ mkSpan := func(start, end string) roachpb.Span {
+ return roachpb.Span{Key: []byte(start), EndKey: []byte(end)}
+ }
+ spans := func(s ...roachpb.Span) roachpb.Spans {
+ return s
+ }
+ const sensitivity = 0.01
+
+ for i, tc := range []struct {
+ input []sql.SpanPartition
+ resolve []roachpb.Spans
+ expect []sql.SpanPartition
+ }{
+ {
+ input: partitions(
+ mkPart(1, mkSpan("a", "j")),
+ mkPart(2, mkSpan("j", "q")),
+ mkPart(3, mkSpan("q", "z")),
+ ),
+ // 6 total ranges, 2 per node.
+ resolve: []roachpb.Spans{
+ spans(mkSpan("a", "c"), mkSpan("c", "e"), mkSpan("e", "j")),
+ spans(mkSpan("j", "q")),
+ spans(mkSpan("q", "y"), mkSpan("y", "z")),
+ },
+ expect: partitions(
+ mkPart(1, mkSpan("a", "e")),
+ mkPart(2, mkSpan("e", "q")),
+ mkPart(3, mkSpan("q", "z")),
+ ),
+ },
+ {
+ input: partitions(
+ mkPart(1, mkSpan("a", "c"), mkSpan("e", "p"), mkSpan("r", "z")),
+ mkPart(2),
+ mkPart(3, mkSpan("c", "e"), mkSpan("p", "r")),
+ ),
+ // 5 total ranges -- on 2 nodes; target should be 1 per node.
+ resolve: []roachpb.Spans{
+ spans(mkSpan("a", "c"), mkSpan("e", "p"), mkSpan("r", "z")),
+ spans(),
+ spans(mkSpan("c", "e"), mkSpan("p", "r")),
+ },
+ expect: partitions(
+ mkPart(1, mkSpan("a", "c"), mkSpan("e", "p")),
+ mkPart(2, mkSpan("r", "z")),
+ mkPart(3, mkSpan("c", "e"), mkSpan("p", "r")),
+ ),
+ },
+ } {
+ t.Run(strconv.Itoa(i), func(t *testing.T) {
+ sp, err := rebalanceSpanPartitions(context.Background(),
+ &echoResolver{result: tc.resolve}, sensitivity, tc.input)
+ require.NoError(t, err)
+ require.Equal(t, tc.expect, sp)
+ })
+ }
+}
diff --git a/pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel b/pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel
index c5173f5e92e1..4bb125e722a1 100644
--- a/pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel
+++ b/pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel
@@ -21,6 +21,7 @@ go_library(
"//pkg/sql",
"//pkg/sql/catalog/descpb",
"//pkg/sql/flowinfra",
+ "//pkg/util",
"@com_github_cockroachdb_errors//:errors",
],
)
diff --git a/pkg/ccl/changefeedccl/changefeedbase/options.go b/pkg/ccl/changefeedccl/changefeedbase/options.go
index ec6c80cdbf24..d132778fb90c 100644
--- a/pkg/ccl/changefeedccl/changefeedbase/options.go
+++ b/pkg/ccl/changefeedccl/changefeedbase/options.go
@@ -297,7 +297,7 @@ var ChangefeedOptionExpectValues = map[string]OptionPermittedValues{
OptUpdatedTimestamps: flagOption,
OptMVCCTimestamps: flagOption,
OptDiff: flagOption,
- OptCompression: enum("gzip"),
+ OptCompression: enum("gzip", "zstd"),
OptSchemaChangeEvents: enum("column_changes", "default"),
OptSchemaChangePolicy: enum("backfill", "nobackfill", "stop", "ignore"),
OptSplitColumnFamilies: flagOption,
diff --git a/pkg/ccl/changefeedccl/changefeedbase/settings.go b/pkg/ccl/changefeedccl/changefeedbase/settings.go
index fb9ae3a8bba2..5169dc1aadce 100644
--- a/pkg/ccl/changefeedccl/changefeedbase/settings.go
+++ b/pkg/ccl/changefeedccl/changefeedbase/settings.go
@@ -13,6 +13,7 @@ import (
"time"
"github.com/cockroachdb/cockroach/pkg/settings"
+ "github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/errors"
)
@@ -231,3 +232,23 @@ var UseMuxRangeFeed = settings.RegisterBoolSetting(
"if true, changefeed uses multiplexing rangefeed RPC",
false,
)
+
+// EventConsumerWorkers specifies the maximum number of workers to use when
+// processing events.
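+//
+// For example, it can be adjusted with (illustrative):
+//
+//	SET CLUSTER SETTING changefeed.event_consumer_workers = 4;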
+var EventConsumerWorkers = settings.RegisterIntSetting(
+ settings.TenantWritable,
+ "changefeed.event_consumer_workers",
+ "the number of workers to use when processing events: <0 disables, "+
+ "0 assigns a reasonable default, >0 assigns the setting value",
+ 0,
+).WithPublic()
+
+// EventConsumerWorkerQueueSize specifies the maximum number of events a worker can buffer.
+var EventConsumerWorkerQueueSize = settings.RegisterIntSetting(
+ settings.TenantWritable,
+ "changefeed.event_consumer_worker_queue_size",
+ "if changefeed.event_consumer_workers is enabled, this setting sets the maxmimum number of events"+
+ "which a worker can buffer",
+ int64(util.ConstantWithMetamorphicTestRange("changefeed.event_consumer_worker_queue_size", 16, 0, 16)),
+ settings.NonNegativeInt,
+).WithPublic()
diff --git a/pkg/ccl/changefeedccl/compression.go b/pkg/ccl/changefeedccl/compression.go
new file mode 100644
index 000000000000..c4d744a1445b
--- /dev/null
+++ b/pkg/ccl/changefeedccl/compression.go
@@ -0,0 +1,70 @@
+// Copyright 2022 The Cockroach Authors.
+//
+// Licensed as a CockroachDB Enterprise file under the Cockroach Community
+// License (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
+
+package changefeedccl
+
+import (
+ stdgzip "compress/gzip"
+ "io"
+ "strings"
+
+ "github.com/cockroachdb/cockroach/pkg/settings"
+ "github.com/cockroachdb/cockroach/pkg/util"
+ "github.com/cockroachdb/errors"
+ "github.com/klauspost/compress/zstd"
+ "github.com/klauspost/pgzip"
+)
+
+var useFastGzip = settings.RegisterBoolSetting(
+ settings.TenantWritable,
+ "changefeed.fast_gzip.enabled",
+ "use fast gzip implementation",
+ util.ConstantWithMetamorphicTestBool(
+ "changefeed.fast_gzip.enabled", true,
+ ),
+).WithPublic()
+
+type compressionAlgo string
+
+const sinkCompressionGzip compressionAlgo = "gzip"
+const sinkCompressionZstd compressionAlgo = "zstd"
+
+func (a compressionAlgo) enabled() bool {
+ return a != ""
+}
+
+// newCompressionCodec returns a compression codec for the specified algorithm,
+// which writes compressed data to the destination.
+// TODO(yevgeniy): Support compression configuration (level, speed, etc).
+// TODO(yevgeniy): Add telemetry.
+func newCompressionCodec(
+ algo compressionAlgo, sv *settings.Values, dest io.Writer,
+) (io.WriteCloser, error) {
+ switch algo {
+ case sinkCompressionGzip:
+ if useFastGzip.Get(sv) {
+ return pgzip.NewWriterLevel(dest, pgzip.DefaultCompression)
+ }
+ return stdgzip.NewWriterLevel(dest, stdgzip.DefaultCompression)
+ case sinkCompressionZstd:
+ return zstd.NewWriter(dest, zstd.WithEncoderLevel(zstd.SpeedFastest))
+ default:
+ return nil, errors.AssertionFailedf("unsupported compression algorithm %q", algo)
+ }
+}
+
+// compressionFromString returns the compression algorithm type along with the file extension.
+func compressionFromString(algo string) (_ compressionAlgo, ext string, _ error) {
+ if strings.EqualFold(algo, string(sinkCompressionGzip)) {
+ return sinkCompressionGzip, ".gz", nil
+ }
+ if strings.EqualFold(algo, string(sinkCompressionZstd)) {
+ return sinkCompressionZstd, ".zst", nil
+ }
+ return "", "", errors.AssertionFailedf("unsupported compression algorithm %q", algo)
+}
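+
+// A minimal usage sketch (st, buf, and payload are illustrative):
+//
+//	algo, ext, _ := compressionFromString("gzip") // -> sinkCompressionGzip, ".gz"
+//	w, _ := newCompressionCodec(algo, &st.SV, &buf)
+//	_, _ = w.Write(payload)
+//	_ = w.Close() // flushes remaining compressed bytes into buf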
diff --git a/pkg/ccl/changefeedccl/encoder.go b/pkg/ccl/changefeedccl/encoder.go
index a954fb41c4bb..c08f4dff42e1 100644
--- a/pkg/ccl/changefeedccl/encoder.go
+++ b/pkg/ccl/changefeedccl/encoder.go
@@ -46,7 +46,7 @@ func getEncoder(
) (Encoder, error) {
switch opts.Format {
case changefeedbase.OptFormatJSON:
- return makeJSONEncoder(opts, targets)
+ return makeJSONEncoder(opts)
case changefeedbase.OptFormatAvro, changefeedbase.DeprecatedOptFormatAvro:
return newConfluentAvroEncoder(opts, targets)
case changefeedbase.OptFormatCSV:
diff --git a/pkg/ccl/changefeedccl/encoder_json.go b/pkg/ccl/changefeedccl/encoder_json.go
index 2cd770818372..03f1a96ba1fe 100644
--- a/pkg/ccl/changefeedccl/encoder_json.go
+++ b/pkg/ccl/changefeedccl/encoder_json.go
@@ -19,6 +19,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sessiondatapb"
+ "github.com/cockroachdb/cockroach/pkg/util/cache"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/json"
"github.com/cockroachdb/errors"
@@ -32,8 +33,9 @@ type jsonEncoder struct {
updatedField, mvccTimestampField, beforeField, keyInValue, topicInValue bool
envelopeType changefeedbase.EnvelopeType
- targets changefeedbase.Targets
- buf bytes.Buffer
+ buf bytes.Buffer
+ versionEncoder func(ed *cdcevent.EventDescriptor) *versionEncoder
+ envelopeEncoder func(evCtx eventContext, updated, prev cdcevent.Row) (json.JSON, error)
}
var _ Encoder = &jsonEncoder{}
@@ -45,20 +47,29 @@ func canJSONEncodeMetadata(e changefeedbase.EnvelopeType) bool {
return e == changefeedbase.OptEnvelopeBare || e == changefeedbase.OptEnvelopeWrapped
}
-func makeJSONEncoder(
- opts changefeedbase.EncodingOptions, targets changefeedbase.Targets,
-) (*jsonEncoder, error) {
+func makeJSONEncoder(opts changefeedbase.EncodingOptions) (*jsonEncoder, error) {
+ versionCache := cache.NewUnorderedCache(cdcevent.DefaultCacheConfig)
e := &jsonEncoder{
- targets: targets,
- envelopeType: opts.Envelope,
- }
- e.updatedField = opts.UpdatedTimestamps
- e.mvccTimestampField = opts.MVCCTimestamps
- // In the bare envelope we don't output diff directly, it's incorporated into the
- // projection as desired.
- e.beforeField = opts.Diff && opts.Envelope != changefeedbase.OptEnvelopeBare
- e.keyInValue = opts.KeyInValue
- e.topicInValue = opts.TopicInValue
+ envelopeType: opts.Envelope,
+ updatedField: opts.UpdatedTimestamps,
+ mvccTimestampField: opts.MVCCTimestamps,
+ // In the bare envelope we don't output diff directly, it's incorporated into the
+ // projection as desired.
+ beforeField: opts.Diff && opts.Envelope != changefeedbase.OptEnvelopeBare,
+ keyInValue: opts.KeyInValue,
+ topicInValue: opts.TopicInValue,
+ versionEncoder: func(ed *cdcevent.EventDescriptor) *versionEncoder {
+ key := cdcevent.CacheKey{
+ ID: ed.TableID,
+ Version: ed.Version,
+ FamilyID: ed.FamilyID,
+ }
+ return cdcevent.GetCachedOrCreate(key, versionCache, func() interface{} {
+ return &versionEncoder{}
+ }).(*versionEncoder)
+ },
+ }
+
if !canJSONEncodeMetadata(e.envelopeType) {
if e.keyInValue {
return nil, errors.Errorf(`%s is only usable with %s=%s`,
@@ -69,16 +80,28 @@ func makeJSONEncoder(
changefeedbase.OptTopicInValue, changefeedbase.OptEnvelope, changefeedbase.OptEnvelopeWrapped)
}
}
+
+ if e.envelopeType == changefeedbase.OptEnvelopeWrapped {
+ if err := e.initWrappedEnvelope(); err != nil {
+ return nil, err
+ }
+ } else {
+ if err := e.initRawEnvelope(); err != nil {
+ return nil, err
+ }
+ }
+
return e, nil
}
+// versionEncoder memoizes version-specific encoding state.
+type versionEncoder struct {
+ valueBuilder *json.FixedKeysObjectBuilder
+}
+
// EncodeKey implements the Encoder interface.
func (e *jsonEncoder) EncodeKey(_ context.Context, row cdcevent.Row) ([]byte, error) {
- jsonEntries, err := e.encodeKeyRaw(row)
- if err != nil {
- return nil, err
- }
- j, err := json.MakeJSON(jsonEntries)
+ j, err := e.versionEncoder(row.EventDescriptor).encodeKeyRaw(row)
if err != nil {
return nil, err
}
@@ -87,126 +110,243 @@ func (e *jsonEncoder) EncodeKey(_ context.Context, row cdcevent.Row) ([]byte, er
return e.buf.Bytes(), nil
}
-func (e *jsonEncoder) encodeKeyRaw(row cdcevent.Row) ([]interface{}, error) {
- var jsonEntries []interface{}
+func (e *versionEncoder) encodeKeyRaw(row cdcevent.Row) (json.JSON, error) {
+ kb := json.NewArrayBuilder(1)
if err := row.ForEachKeyColumn().Datum(func(d tree.Datum, col cdcevent.ResultColumn) error {
j, err := tree.AsJSON(d, sessiondatapb.DataConversionConfig{}, time.UTC)
if err != nil {
return err
}
- jsonEntries = append(jsonEntries, j)
+ kb.Add(j)
return nil
}); err != nil {
return nil, err
}
- return jsonEntries, nil
+ return kb.Build(), nil
}
-func rowAsGoNative(row cdcevent.Row) (map[string]interface{}, error) {
+func (e *versionEncoder) encodeKeyInValue(
+ updated cdcevent.Row, b *json.FixedKeysObjectBuilder,
+) error {
+ keyEntries, err := e.encodeKeyRaw(updated)
+ if err != nil {
+ return err
+ }
+ return b.Set("key", keyEntries)
+}
+
+var emptyJSONValue = func() json.JSON {
+ j, err := json.MakeJSON(map[string]interface{}{})
+ if err != nil {
+ panic(err)
+ }
+ return j
+}()
+
+func (e *versionEncoder) rowAsGoNative(row cdcevent.Row, meta json.JSON) (json.JSON, error) {
if !row.HasValues() || row.IsDeleted() {
- return nil, nil
+ if meta != nil {
+ b := json.NewObjectBuilder(1)
+ b.Add(jsonMetaSentinel, meta)
+ return b.Build(), nil
+ }
+ return json.NullJSONValue, nil
}
- result := make(map[string]interface{})
- if err := row.ForEachColumn().Datum(func(d tree.Datum, col cdcevent.ResultColumn) (err error) {
- result[col.Name], err = tree.AsJSON(d, sessiondatapb.DataConversionConfig{}, time.UTC)
- return err
+ if e.valueBuilder == nil {
+ keys := make([]string, 0, len(row.ResultColumns()))
+ _ = row.ForEachColumn().Col(func(col cdcevent.ResultColumn) error {
+ keys = append(keys, col.Name)
+ return nil
+ })
+ if meta != nil {
+ keys = append(keys, jsonMetaSentinel)
+ }
+ b, err := json.NewFixedKeysObjectBuilder(keys)
+ if err != nil {
+ return nil, err
+ }
+ e.valueBuilder = b
+ }
+
+ if err := row.ForEachColumn().Datum(func(d tree.Datum, col cdcevent.ResultColumn) error {
+ j, err := tree.AsJSON(d, sessiondatapb.DataConversionConfig{}, time.UTC)
+ if err != nil {
+ return err
+ }
+ return e.valueBuilder.Set(col.Name, j)
}); err != nil {
return nil, err
}
- return result, nil
-}
-// EncodeValue implements the Encoder interface.
-func (e *jsonEncoder) EncodeValue(
- ctx context.Context, evCtx eventContext, updatedRow cdcevent.Row, prevRow cdcevent.Row,
-) ([]byte, error) {
- if e.envelopeType == changefeedbase.OptEnvelopeKeyOnly {
- return nil, nil
+ if meta != nil {
+ if err := e.valueBuilder.Set(jsonMetaSentinel, meta); err != nil {
+ return nil, err
+ }
}
- if updatedRow.IsDeleted() && !canJSONEncodeMetadata(e.envelopeType) {
- return nil, nil
- }
+ return e.valueBuilder.Build()
+}
- after, err := rowAsGoNative(updatedRow)
- if err != nil {
- return nil, err
+func (e *jsonEncoder) initRawEnvelope() error {
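+ // The bare envelope emits the row columns at the top level and nests any
+ // requested metadata under the jsonMetaSentinel key, e.g. (illustrative):
+ //   {"a": 0, "b": "dog", "__crdb__": {"updated": "..."}}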
+ // Determine if we need to add crdb meta.
+ var metaKeys []string
+ if e.updatedField {
+ metaKeys = append(metaKeys, "updated")
+ }
+ if e.mvccTimestampField {
+ metaKeys = append(metaKeys, "mvcc_timestamp")
+ }
+ if e.keyInValue {
+ metaKeys = append(metaKeys, "key")
+ }
+ if e.topicInValue {
+ metaKeys = append(metaKeys, "topic")
}
- before, err := rowAsGoNative(prevRow)
- if err != nil {
- return nil, err
+ // Setup builder for crdb meta if needed.
+ var metaBuilder *json.FixedKeysObjectBuilder
+ if len(metaKeys) > 0 {
+ b, err := json.NewFixedKeysObjectBuilder(metaKeys)
+ if err != nil {
+ return err
+ }
+ metaBuilder = b
}
- var jsonEntries map[string]interface{}
- var meta map[string]interface{}
- if canJSONEncodeMetadata(e.envelopeType) {
- if e.envelopeType == changefeedbase.OptEnvelopeWrapped {
- if after != nil {
- jsonEntries = map[string]interface{}{`after`: after}
- } else {
- jsonEntries = map[string]interface{}{`after`: nil}
+ e.envelopeEncoder = func(evCtx eventContext, updated, _ cdcevent.Row) (_ json.JSON, err error) {
+ ve := e.versionEncoder(updated.EventDescriptor)
+ if len(metaKeys) == 0 {
+ if updated.IsDeleted() {
+ return emptyJSONValue, nil
}
- meta = jsonEntries
- } else {
- meta = make(map[string]interface{}, 1)
- if after != nil {
- jsonEntries = after
- } else {
- jsonEntries = map[string]interface{}{}
+ return ve.rowAsGoNative(updated, nil)
+ }
+
+ if e.updatedField {
+ if err := metaBuilder.Set("updated", json.FromString(evCtx.updated.AsOfSystemTime())); err != nil {
+ return nil, err
}
- jsonEntries[jsonMetaSentinel] = meta
}
- if e.beforeField {
- if before != nil {
- jsonEntries[`before`] = before
- } else {
- jsonEntries[`before`] = nil
+
+ if e.mvccTimestampField {
+ if err := metaBuilder.Set("mvcc_timestamp", json.FromString(evCtx.mvcc.AsOfSystemTime())); err != nil {
+ return nil, err
}
}
+
if e.keyInValue {
- keyEntries, err := e.encodeKeyRaw(updatedRow)
- if err != nil {
+ if err := ve.encodeKeyInValue(updated, metaBuilder); err != nil {
return nil, err
}
- jsonEntries[`key`] = keyEntries
}
+
if e.topicInValue {
- jsonEntries[`topic`] = evCtx.topic
+ if err := metaBuilder.Set("topic", json.FromString(evCtx.topic)); err != nil {
+ return nil, err
+ }
}
- } else {
- jsonEntries = after
+
+ meta, err := metaBuilder.Build()
+ if err != nil {
+ return nil, err
+ }
+ return ve.rowAsGoNative(updated, meta)
}
+ return nil
+}
- // TODO (zinger): Existing behavior special-cases these fields for
- // no particular reason. Fold this into the above block.
- if e.updatedField || e.mvccTimestampField {
- if meta == nil {
- if e.envelopeType == changefeedbase.OptEnvelopeWrapped {
- meta = jsonEntries
+func (e *jsonEncoder) initWrappedEnvelope() error {
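+ // The wrapped envelope is built with a fixed set of keys; only the keys
+ // enabled by the options below are present, e.g. (illustrative):
+ //   {"after": {...}, "before": null, "updated": "...", "mvcc_timestamp": "..."}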
+ keys := []string{"after"}
+ if e.beforeField {
+ keys = append(keys, "before")
+ }
+ if e.keyInValue {
+ keys = append(keys, "key")
+ }
+ if e.topicInValue {
+ keys = append(keys, "topic")
+ }
+ if e.updatedField {
+ keys = append(keys, "updated")
+ }
+ if e.mvccTimestampField {
+ keys = append(keys, "mvcc_timestamp")
+ }
+ b, err := json.NewFixedKeysObjectBuilder(keys)
+ if err != nil {
+ return err
+ }
+
+ e.envelopeEncoder = func(evCtx eventContext, updated, prev cdcevent.Row) (json.JSON, error) {
+ ve := e.versionEncoder(updated.EventDescriptor)
+ after, err := ve.rowAsGoNative(updated, nil)
+ if err != nil {
+ return nil, err
+ }
+ if err := b.Set("after", after); err != nil {
+ return nil, err
+ }
+
+ if e.beforeField {
+ var before json.JSON
+ if prev.IsInitialized() && !prev.IsDeleted() {
+ before, err = e.versionEncoder(prev.EventDescriptor).rowAsGoNative(prev, nil)
+ if err != nil {
+ return nil, err
+ }
} else {
- meta = make(map[string]interface{}, 1)
- jsonEntries[jsonMetaSentinel] = meta
+ before = json.NullJSONValue
+ }
+
+ if err := b.Set("before", before); err != nil {
+ return nil, err
+ }
+ }
+
+ if e.keyInValue {
+ if err := ve.encodeKeyInValue(updated, b); err != nil {
+ return nil, err
}
}
+
+ if e.topicInValue {
+ if err := b.Set("topic", json.FromString(evCtx.topic)); err != nil {
+ return nil, err
+ }
+ }
+
if e.updatedField {
- meta[`updated`] = evCtx.updated.AsOfSystemTime()
+ if err := b.Set("updated", json.FromString(evCtx.updated.AsOfSystemTime())); err != nil {
+ return nil, err
+ }
}
+
if e.mvccTimestampField {
- meta[`mvcc_timestamp`] = evCtx.mvcc.AsOfSystemTime()
+ if err := b.Set("mvcc_timestamp", json.FromString(evCtx.mvcc.AsOfSystemTime())); err != nil {
+ return nil, err
+ }
}
+
+ return b.Build()
}
+ return nil
+}
- if metaFields, ok := jsonEntries[jsonMetaSentinel]; ok {
- m, ok := metaFields.(map[string]interface{})
- if !ok || len(m) == 0 {
- delete(jsonEntries, jsonMetaSentinel)
- }
+// EncodeValue implements the Encoder interface.
+func (e *jsonEncoder) EncodeValue(
+ ctx context.Context, evCtx eventContext, updatedRow cdcevent.Row, prevRow cdcevent.Row,
+) ([]byte, error) {
+ if e.envelopeType == changefeedbase.OptEnvelopeKeyOnly {
+ return nil, nil
+ }
+
+ if updatedRow.IsDeleted() && !canJSONEncodeMetadata(e.envelopeType) {
+ return nil, nil
}
- j, err := json.MakeJSON(jsonEntries)
+ j, err := e.envelopeEncoder(evCtx, updatedRow, prevRow)
if err != nil {
return nil, err
}
diff --git a/pkg/ccl/changefeedccl/encoder_test.go b/pkg/ccl/changefeedccl/encoder_test.go
index 9337cfa5c93d..eaf680bd63f7 100644
--- a/pkg/ccl/changefeedccl/encoder_test.go
+++ b/pkg/ccl/changefeedccl/encoder_test.go
@@ -12,6 +12,7 @@ import (
"context"
gosql "database/sql"
"fmt"
+ "math/rand"
"net/url"
"testing"
@@ -20,13 +21,17 @@ import (
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdctest"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
+ "github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
+ "github.com/cockroachdb/cockroach/pkg/sql/randgen"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
+ "github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
+ "github.com/cockroachdb/cockroach/pkg/util/randutil"
"github.com/cockroachdb/cockroach/pkg/workload/ledger"
"github.com/cockroachdb/cockroach/pkg/workload/workloadsql"
"github.com/stretchr/testify/require"
@@ -595,6 +600,15 @@ func TestAvroSchemaNaming(t *testing.T) {
testFn := func(t *testing.T, s TestServer, f cdctest.TestFeedFactory) {
sqlDB := sqlutils.MakeSQLRunner(s.DB)
+
+ // The expected results depend on caching in the avro encoder.
+ // With multiple workers, there are multiple encoders which each
+ // maintain their own caches. Depending on the number of
+ // workers, the results below may change, so disable parallel workers
+ // here for simplicity.
+ changefeedbase.EventConsumerWorkers.Override(
+ context.Background(), &s.Server.ClusterSettings().SV, -1)
+
sqlDB.Exec(t, `CREATE DATABASE movr`)
sqlDB.Exec(t, `CREATE TABLE movr.drivers (id INT PRIMARY KEY, name STRING)`)
sqlDB.Exec(t,
@@ -841,3 +855,199 @@ func TestAvroLedger(t *testing.T) {
cdcTest(t, testFn, feedTestForceSink("kafka"))
}
+
+func BenchmarkEncoders(b *testing.B) {
+ rng := randutil.NewTestRandWithSeed(2365865412074131521)
+
+ // Initialize column types for tests; this is done before
+ // benchmark runs so that all reruns use the same types as generated
+ // by random seed above.
+ const maxKeyCols = 4
+ const maxValCols = 2
+
+ // [benchmark body and the random-type helper's signature are elided in this excerpt]
+
+ var allTypes []*types.T
+ for _, typ := range randgen.SeedTypes {
+ switch typ {
+ case types.Geography, types.Geometry:
+ // Ignore geo types: the conversion is
+ // Datum -> MarshalGeo -> go JSON bytes -> ParseJSON -> Go native -> json.JSON.
+ // Benchmarking this generates too much noise.
+ // TODO: fix this.
+ case types.Void:
+ // Just not a very interesting thing to encode
+ default:
+ allTypes = append(allTypes, typ)
+ }
+ }
+
+ // Add tuple types.
+ var tupleTypes []*types.T
+ makeTupleType := func() *types.T {
+ contents := make([]*types.T, rng.Intn(6)) // Up to 6 fields
+ for i := range contents {
+ contents[i] = randgen.RandTypeFromSlice(rng, allTypes)
+ }
+ candidateTuple := types.MakeTuple(contents)
+ // Ensure tuple type is unique.
+ for _, t := range tupleTypes {
+ if t.Equal(candidateTuple) {
+ return nil
+ }
+ }
+ tupleTypes = append(tupleTypes, candidateTuple)
+ return candidateTuple
+ }
+
+ const numTupleTypes = 5
+ for i := 0; i < numTupleTypes; i++ {
+ var typ *types.T
+ for typ == nil {
+ typ = makeTupleType()
+ }
+ allTypes = append(allTypes, typ)
+ }
+
+ randTypes := func(numTypes int, mustBeKeyType bool) []*types.T {
+ typs := make([]*types.T, numTypes)
+ for i := range typs {
+ typ := randgen.RandTypeFromSlice(rng, allTypes)
+ for mustBeKeyType && colinfo.MustBeValueEncoded(typ) {
+ typ = randgen.RandTypeFromSlice(rng, allTypes)
+ }
+ typs[i] = typ
+ }
+ return typs
+ }
+
+ return randTypes(numKeyTypes, true), randTypes(numColTypes, false)
+}
diff --git a/pkg/ccl/changefeedccl/event_processing.go b/pkg/ccl/changefeedccl/event_processing.go
index 8114edf1df13..7af53ebfccb2 100644
--- a/pkg/ccl/changefeedccl/event_processing.go
+++ b/pkg/ccl/changefeedccl/event_processing.go
@@ -10,18 +10,25 @@ package changefeedccl
import (
"context"
+ "hash"
+ "hash/crc32"
+ "runtime"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdceval"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdcevent"
+ "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvevent"
"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
+ "github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/span"
+ "github.com/cockroachdb/cockroach/pkg/util/syncutil"
+ "github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
)
@@ -32,8 +39,16 @@ type eventContext struct {
topic string
}
+type eventConsumer interface {
+ ConsumeEvent(ctx context.Context, ev kvevent.Event) error
+ Close() error
+ Flush(ctx context.Context) error
+}
+
+type frontier interface{ Frontier() hlc.Timestamp }
+
type kvEventToRowConsumer struct {
- frontier *span.Frontier
+ frontier
encoder Encoder
scratch bufalloc.ByteAllocator
sink EventSink
@@ -48,11 +63,102 @@ type kvEventToRowConsumer struct {
topicNamer *TopicNamer
}
+func newEventConsumer(
+ ctx context.Context,
+ flowCtx *execinfra.FlowCtx,
+ feed ChangefeedConfig,
+ spanFrontier *span.Frontier,
+ cursor hlc.Timestamp,
+ sink EventSink,
+ details ChangefeedConfig,
+ expr execinfrapb.Expression,
+ knobs TestingKnobs,
+ metrics *Metrics,
+ isSinkless bool,
+) (eventConsumer, EventSink, error) {
+ cfg := flowCtx.Cfg
+ evalCtx := flowCtx.EvalCtx
+
+ makeConsumer := func(s EventSink, frontier frontier) (eventConsumer, error) {
+ var err error
+ encodingOpts, err := feed.Opts.GetEncodingOptions()
+ if err != nil {
+ return nil, err
+ }
+ encoder, err := getEncoder(encodingOpts, feed.Targets)
+ if err != nil {
+ return nil, err
+ }
+
+ var topicNamer *TopicNamer
+ if encodingOpts.TopicInValue {
+ topicNamer, err = MakeTopicNamer(feed.Targets)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return newKVEventToRowConsumer(ctx, cfg, evalCtx, frontier, cursor, s,
+ encoder, details, expr, knobs, topicNamer)
+ }
+
+ // TODO (jayshrivastava) enable parallel consumers for sinkless changefeeds
+ numWorkers := changefeedbase.EventConsumerWorkers.Get(&cfg.Settings.SV)
+ if numWorkers == 0 {
+ // Pick a reasonable default.
+ numWorkers = defaultNumWorkers()
+ }
+ if numWorkers <= 1 || isSinkless {
+ c, err := makeConsumer(sink, spanFrontier)
+ if err != nil {
+ return nil, nil, err
+ }
+ return c, sink, err
+ }
+
+ c := &parallelEventConsumer{
+ g: ctxgroup.WithContext(ctx),
+ hasher: makeHasher(),
+ metrics: metrics,
+ termCh: make(chan struct{}),
+ flushCh: make(chan struct{}, 1),
+ doneCh: make(chan struct{}),
+ numWorkers: numWorkers,
+ workerCh: make([]chan kvevent.Event, numWorkers),
+ workerChSize: changefeedbase.EventConsumerWorkerQueueSize.Get(&cfg.Settings.SV),
+ spanFrontier: spanFrontier,
+ }
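+ // safeSink is assumed to invoke beforeFlush (the parallel consumer's Flush)
+ // before flushing the wrapped sink, so worker buffers drain first.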
+ ss := &safeSink{wrapped: sink, beforeFlush: c.Flush}
+ c.makeConsumer = func() (eventConsumer, error) {
+ return makeConsumer(ss, c)
+ }
+
+ if err := c.startWorkers(); err != nil {
+ return nil, nil, err
+ }
+ return c, ss, nil
+}
+
+func defaultNumWorkers() int64 {
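+ // Use a quarter of the available CPUs, clamped to the range [1, 8].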
+ idealNumber := runtime.GOMAXPROCS(0) >> 2
+ if idealNumber < 1 {
+ return 1
+ }
+ if idealNumber > 8 {
+ return 8
+ }
+ return int64(idealNumber)
+}
+
+func makeHasher() hash.Hash32 {
+ return crc32.New(crc32.MakeTable(crc32.IEEE))
+}
+
func newKVEventToRowConsumer(
ctx context.Context,
cfg *execinfra.ServerConfig,
evalCtx *eval.Context,
- frontier *span.Frontier,
+ frontier frontier,
cursor hlc.Timestamp,
sink EventSink,
encoder Encoder,
@@ -229,9 +335,14 @@ func (c *kvEventToRowConsumer) ConsumeEvent(ctx context.Context, ev kvevent.Even
return err
}
}
+
+ // Since we're done processing/converting this event, and will not use much more
+ // than len(key)+len(bytes) worth of resources, adjust allocation to match.
+ a := ev.DetachAlloc()
+ a.AdjustBytesToTarget(ctx, int64(len(keyCopy)+len(valueCopy)))
+
if err := c.sink.EmitRow(
- ctx, topic,
- keyCopy, valueCopy, schemaTimestamp, mvccTimestamp, ev.DetachAlloc(),
+ ctx, topic, keyCopy, valueCopy, schemaTimestamp, mvccTimestamp, a,
); err != nil {
return err
}
@@ -240,3 +351,234 @@ func (c *kvEventToRowConsumer) ConsumeEvent(ctx context.Context, ev kvevent.Even
}
return nil
}
+
+// Close is a noop for the kvEventToRowConsumer because it
+// has no goroutines in flight.
+func (c *kvEventToRowConsumer) Close() error {
+ return nil
+}
+
+// Flush is a noop for the kvEventToRowConsumer because it does not buffer any events.
+func (c *kvEventToRowConsumer) Flush(ctx context.Context) error {
+ return nil
+}
+
+type parallelEventConsumer struct {
+ // g is a group used to manage worker goroutines.
+ g ctxgroup.Group
+
+ // hasher is used to shard keys into worker queues.
+ hasher hash.Hash32
+
+ metrics *Metrics
+
+ // doneCh is used to shut down all workers when
+ // parallelEventConsumer.Close() is called.
+ doneCh chan struct{}
+
+ // makeConsumer creates a single-threaded consumer
+ // which encodes and emits events.
+ makeConsumer func() (eventConsumer, error)
+
+ // numWorkers is the number of worker threads.
+ numWorkers int64
+ // workerCh stores the event buffer for each worker.
+ // It is a fixed size array with length numWorkers.
+ workerCh []chan kvevent.Event
+ // workerChSize is the maximum number of events a worker can buffer.
+ workerChSize int64
+
+ // spanFrontier stores the frontier for the aggregator
+ // that spawned this event consumer.
+ spanFrontier *span.Frontier
+
+ // termErr and termCh are used to save the first error that occurs
+ // in any worker and signal all workers to stop.
+ //
+ // inFlight, waiting, and flushCh are used to flush the sink.
+ //
+ // flushFrontier tracks the local frontier. It is set to the
+ // spanFrontier upon flushing. This guarantees that workers are
+ // not buffering events which have timestamps lower than the frontier.
+ mu struct {
+ syncutil.Mutex
+ termErr error
+
+ inFlight int
+ waiting bool
+ flushFrontier hlc.Timestamp
+ }
+ termCh chan struct{}
+ flushCh chan struct{}
+}
+
+var _ eventConsumer = (*parallelEventConsumer)(nil)
+
+func (c *parallelEventConsumer) ConsumeEvent(ctx context.Context, ev kvevent.Event) error {
+ startTime := timeutil.Now().UnixNano()
+ defer func() {
+ time := timeutil.Now().UnixNano()
+ c.metrics.ParallelConsumerConsumeNanos.Inc(time - startTime)
+ }()
+
+ bucket := c.getBucketForEvent(ev)
+
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-c.termCh:
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ return c.mu.termErr
+ case c.workerCh[bucket] <- ev:
+ c.incInFlight()
+ return nil
+ }
+}
+
+// getBucketForEvent returns a deterministic value in the range
+// [0, parallelEventConsumer.numWorkers).
+//
+// This is used to ensure events of the same key get sent to the same worker.
+// Events of the same key are sent to the same worker so per-key ordering is
+// maintained.
+func (c *parallelEventConsumer) getBucketForEvent(ev kvevent.Event) int64 {
+ c.hasher.Reset()
+ c.hasher.Write(ev.KV().Key)
+ return int64(c.hasher.Sum32()) % c.numWorkers
+}
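
For readers outside this package, the bucketing above reduces to a small, self-contained idiom. The sketch below (hypothetical names, standard library only) shows the same key-to-bucket mapping and why it preserves per-key ordering; it is illustrative, not the changefeed code itself.

package main

import (
	"fmt"
	"hash/crc32"
)

// bucketForKey maps a key to a worker index in [0, numWorkers).
// Identical keys always hash to the same bucket, so events for a key are
// always consumed by the same worker, which preserves per-key order.
func bucketForKey(key []byte, numWorkers int64) int64 {
	h := crc32.New(crc32.MakeTable(crc32.IEEE))
	h.Write(key)
	return int64(h.Sum32()) % numWorkers
}

func main() {
	fmt.Println(bucketForKey([]byte("/Table/42/1/7"), 16)) // some bucket in [0, 16)
	fmt.Println(bucketForKey([]byte("/Table/42/1/7"), 16)) // the same bucket again
}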
+
+func (c *parallelEventConsumer) startWorkers() error {
+ // Create the consumers in a separate loop so that returning
+ // in case of an error is simple and does not require
+ // shutting down goroutines.
+ consumers := make([]eventConsumer, c.numWorkers)
+ for i := int64(0); i < c.numWorkers; i++ {
+ consumer, err := c.makeConsumer()
+ if err != nil {
+ return err
+ }
+ consumers[i] = consumer
+ }
+
+ for i := int64(0); i < c.numWorkers; i++ {
+ c.workerCh[i] = make(chan kvevent.Event, c.workerChSize)
+
+ // c.g.GoCtx requires a func(context.Context), so
+ // a closure is used to pass the consumer and id to
+ // the worker.
+ id := i
+ consumer := consumers[i]
+ workerClosure := func(ctx2 context.Context) error {
+ return c.workerLoop(ctx2, consumer, id)
+ }
+ c.g.GoCtx(workerClosure)
+ }
+ return nil
+}
+
+func (c *parallelEventConsumer) workerLoop(
+ ctx context.Context, consumer eventConsumer, id int64,
+) error {
+ for {
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-c.doneCh:
+ return nil
+ case <-c.termCh:
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ return c.mu.termErr
+ case e := <-c.workerCh[id]:
+ if err := consumer.ConsumeEvent(ctx, e); err != nil {
+ return c.setWorkerError(err)
+ }
+ c.decInFlight()
+ }
+ }
+}
+
+func (c *parallelEventConsumer) incInFlight() {
+ c.mu.Lock()
+ c.mu.inFlight++
+ c.mu.Unlock()
+ c.metrics.ParallelConsumerInFlightEvents.Inc(1)
+}
+
+func (c *parallelEventConsumer) decInFlight() {
+ c.mu.Lock()
+ c.mu.inFlight--
+ c.metrics.ParallelConsumerInFlightEvents.Dec(1)
+ notifyFlush := c.mu.waiting && c.mu.inFlight == 0
+ c.mu.Unlock()
+
+ // If someone is waiting on a flush, signal to them
+ // that there are no more events.
+ if notifyFlush {
+ c.flushCh <- struct{}{}
+ }
+}
+
+func (c *parallelEventConsumer) setWorkerError(err error) error {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ if c.mu.termErr == nil {
+ c.mu.termErr = err
+ close(c.termCh)
+ }
+
+ return err
+}
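
The pattern here — record the first error under the mutex, then close termCh so every select on it unblocks — is a first-error-wins broadcast. A minimal freestanding sketch of the same shape (hypothetical names, not the changefeed types):

package main

import (
	"errors"
	"fmt"
	"sync"
)

type terminator struct {
	mu     sync.Mutex
	err    error
	termCh chan struct{} // closed exactly once, on the first error
}

func (t *terminator) fail(err error) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.err == nil {
		t.err = err
		close(t.termCh) // wakes every goroutine selecting on <-t.termCh
	}
	return err
}

func main() {
	t := &terminator{termCh: make(chan struct{})}
	_ = t.fail(errors.New("boom"))
	<-t.termCh // does not block: the channel is closed
	fmt.Println(t.err)
}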
+
+// Flush flushes the consumer by blocking until all events are consumed,
+// or until there is an error.
+func (c *parallelEventConsumer) Flush(ctx context.Context) error {
+ startTime := timeutil.Now().UnixNano()
+ defer func() {
+ time := timeutil.Now().UnixNano()
+ c.metrics.ParallelConsumerFlushNanos.Inc(time - startTime)
+ }()
+
+ needFlush := func() bool {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ if c.mu.inFlight > 0 {
+ c.mu.waiting = true
+ }
+ return c.mu.waiting
+ }
+
+ if !needFlush() {
+ return nil
+ }
+
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-c.termCh:
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ return c.mu.termErr
+ case <-c.flushCh:
+ c.mu.Lock()
+ c.mu.waiting = false
+ c.mu.flushFrontier = c.spanFrontier.Frontier()
+ c.mu.Unlock()
+ return nil
+ }
+}
+
+func (c *parallelEventConsumer) Frontier() hlc.Timestamp {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ return c.mu.flushFrontier
+}
+
+func (c *parallelEventConsumer) Close() error {
+ // Signal the done channel and wait for all workers to finish.
+ // If an error occurred, at least one worker will return an error, so
+ // it will be returned by c.g.Wait().
+ close(c.doneCh)
+ return c.g.Wait()
+}
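
Taken together, the interface above implies a simple caller lifecycle: enqueue events, flush before emitting anything that asserts completeness (such as a resolved timestamp), then close. A hedged sketch of such a caller follows; drainAndCheckpoint is a hypothetical function that assumes this file's imports and is illustrative only.

// drainAndCheckpoint is a hypothetical caller of the eventConsumer
// interface declared above; it is not part of this change.
func drainAndCheckpoint(
	ctx context.Context, c eventConsumer, events <-chan kvevent.Event,
) error {
	for ev := range events {
		if err := c.ConsumeEvent(ctx, ev); err != nil {
			return err
		}
	}
	// Flush blocks until every buffered event has been emitted, so anything
	// written after this point cannot run ahead of the data.
	if err := c.Flush(ctx); err != nil {
		return err
	}
	// Close stops the workers; an error hit by any worker surfaces here.
	return c.Close()
}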
diff --git a/pkg/ccl/changefeedccl/event_processing_test.go b/pkg/ccl/changefeedccl/event_processing_test.go
index b61ca883029b..63b66f3bcee7 100644
--- a/pkg/ccl/changefeedccl/event_processing_test.go
+++ b/pkg/ccl/changefeedccl/event_processing_test.go
@@ -9,12 +9,21 @@
package changefeedccl
import (
+ "math/rand"
"testing"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdcevent"
+ "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvevent"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
+ "github.com/cockroachdb/cockroach/pkg/keys"
+ "github.com/cockroachdb/cockroach/pkg/roachpb"
+ "github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside"
+ "github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
+ "github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
+ "github.com/cockroachdb/cockroach/pkg/util/randutil"
+ "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -166,3 +175,73 @@ func TestTopicForEvent(t *testing.T) {
})
}
}
+
+// TestShardingByKey tests that the sharding function is deterministic and
+// maps keys into the range [0, numWorkers).
+func TestShardingByKey(t *testing.T) {
+ defer leaktest.AfterTest(t)()
+
+ rng, _ := randutil.NewTestRand()
+ p := parallelEventConsumer{numWorkers: 16, hasher: makeHasher()}
+
+ poolSize := 10000
+ eventPool := make([]kvevent.Event, poolSize)
+ for i := 0; i < poolSize; i++ {
+ eventPool[i] = makeKVEventKeyOnly(rng, 128)
+ }
+
+ seen := map[string]int64{}
+ for i := 0; i < poolSize; i++ {
+ ev := eventPool[i]
+
+ b := p.getBucketForEvent(ev)
+ key := getKeyFromKVEvent(ev)
+
+ assert.True(t, 0 <= b && b < 16)
+
+ if bucket, ok := seen[key]; ok {
+ assert.Equal(t, bucket, b)
+ }
+ seen[key] = b
+ }
+}
+
+func BenchmarkShardingByKey(b *testing.B) {
+ rng, _ := randutil.NewTestRand()
+ p := parallelEventConsumer{numWorkers: 32, hasher: makeHasher()}
+
+ poolSize := 1000
+ eventPool := make([]kvevent.Event, poolSize)
+ for i := 0; i < poolSize; i++ {
+ eventPool[i] = makeKVEventKeyOnly(rng, 2<<31-1)
+ }
+
+ b.ResetTimer()
+
+ for i := 0; i < b.N; i++ {
+ p.getBucketForEvent(eventPool[rng.Intn(len(eventPool))])
+ }
+}
+
+func makeKVEventKeyOnly(rng *rand.Rand, upper int) kvevent.Event {
+ testTableID := 42
+
+ key, err := keyside.Encode(
+ keys.SystemSQLCodec.TablePrefix(uint32(testTableID)),
+ tree.NewDInt(tree.DInt(rng.Intn(upper))),
+ encoding.Ascending,
+ )
+ if err != nil {
+ panic(err)
+ }
+
+ return kvevent.MakeKVEvent(&roachpb.RangeFeedEvent{
+ Val: &roachpb.RangeFeedValue{
+ Key: key,
+ },
+ })
+}
+
+func getKeyFromKVEvent(ev kvevent.Event) string {
+ return ev.KV().Key.String()
+}
diff --git a/pkg/ccl/changefeedccl/kvevent/BUILD.bazel b/pkg/ccl/changefeedccl/kvevent/BUILD.bazel
index 2d9c2ce3a3ac..35052ab74d33 100644
--- a/pkg/ccl/changefeedccl/kvevent/BUILD.bazel
+++ b/pkg/ccl/changefeedccl/kvevent/BUILD.bazel
@@ -63,6 +63,7 @@ go_test(
"//pkg/util/quotapool",
"//pkg/util/randutil",
"//pkg/util/syncutil",
+ "@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
],
diff --git a/pkg/ccl/changefeedccl/kvevent/alloc.go b/pkg/ccl/changefeedccl/kvevent/alloc.go
index ba63a8b4d7b9..cf3566fb4630 100644
--- a/pkg/ccl/changefeedccl/kvevent/alloc.go
+++ b/pkg/ccl/changefeedccl/kvevent/alloc.go
@@ -46,6 +46,17 @@ func (a *Alloc) Release(ctx context.Context) {
a.clear()
}
+// AdjustBytesToTarget adjusts the byte allocation to the specified target.
+// The target cannot be adjusted higher than the current allocation.
+func (a *Alloc) AdjustBytesToTarget(ctx context.Context, targetBytes int64) {
+ if a.isZero() || targetBytes <= 0 || targetBytes >= a.bytes {
+ return
+ }
+ toRelease := a.bytes - targetBytes
+ a.bytes = targetBytes
+ a.ap.Release(ctx, toRelease, 0)
+}
+
// Bytes returns the size of this alloc in bytes.
func (a *Alloc) Bytes() int64 {
return a.bytes
diff --git a/pkg/ccl/changefeedccl/kvevent/alloc_test.go b/pkg/ccl/changefeedccl/kvevent/alloc_test.go
index 3d4d50294a96..9c899e403a24 100644
--- a/pkg/ccl/changefeedccl/kvevent/alloc_test.go
+++ b/pkg/ccl/changefeedccl/kvevent/alloc_test.go
@@ -17,6 +17,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
+ "github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)
@@ -36,7 +37,7 @@ func TestAllocMergeRandomized(t *testing.T) {
// Allocate N allocs from the P pools.
poolPerm := rand.Perm(P)
for i := range allocs {
- allocs[i] = pools[poolPerm[i%P]].alloc()
+ allocs[i] = pools[poolPerm[i%P]].alloc(1)
}
// Randomly merge the allocs together.
@@ -82,6 +83,26 @@ func TestAllocMergeRandomized(t *testing.T) {
}
}
+func TestAllocAdjust(t *testing.T) {
+ defer leaktest.AfterTest(t)()
+ defer log.Scope(t).Close(t)
+
+ ctx := context.Background()
+ p := &testAllocPool{}
+ a := p.alloc(10)
+ require.EqualValues(t, 10, a.Bytes())
+ require.EqualValues(t, 1, a.Events())
+ a.AdjustBytesToTarget(ctx, 6)
+ require.EqualValues(t, 6, a.Bytes())
+ a.AdjustBytesToTarget(ctx, 7) // no-op
+ require.EqualValues(t, 6, a.Bytes())
+ a.AdjustBytesToTarget(ctx, -5) // no-op
+ require.EqualValues(t, 6, a.Bytes())
+ a.AdjustBytesToTarget(ctx, 1)
+ require.EqualValues(t, 1, a.Bytes())
+ a.Release(ctx)
+}
+
type testAllocPool struct {
syncutil.Mutex
n int64
@@ -91,17 +112,17 @@ type testAllocPool struct {
func (ap *testAllocPool) Release(ctx context.Context, bytes, entries int64) {
ap.Lock()
defer ap.Unlock()
- if ap.n == 0 {
- panic("can't release zero resources")
+ if ap.n < bytes {
+ panic(errors.AssertionFailedf("can't release %d bytes from zero resources", bytes))
}
ap.n -= bytes
}
-func (ap *testAllocPool) alloc() Alloc {
+func (ap *testAllocPool) alloc(bytes int64) Alloc {
ap.Lock()
defer ap.Unlock()
- ap.n++
- return TestingMakeAlloc(1, ap)
+ ap.n += bytes
+ return TestingMakeAlloc(bytes, ap)
}
func (ap *testAllocPool) getN() int {
diff --git a/pkg/ccl/changefeedccl/kvfeed/scanner.go b/pkg/ccl/changefeedccl/kvfeed/scanner.go
index 2c1f2195b7ed..d53f92336dcb 100644
--- a/pkg/ccl/changefeedccl/kvfeed/scanner.go
+++ b/pkg/ccl/changefeedccl/kvfeed/scanner.go
@@ -194,7 +194,7 @@ func (p *scanRequestScanner) exportSpan(
func getSpansToProcess(
ctx context.Context, ds *kvcoord.DistSender, targetSpans []roachpb.Span,
) ([]roachpb.Span, error) {
- ranges, err := allRangeSpans(ctx, ds, targetSpans)
+ ranges, err := AllRangeSpans(ctx, ds, targetSpans)
if err != nil {
return nil, err
}
@@ -261,7 +261,8 @@ func slurpScanResponse(
return nil
}
-func allRangeSpans(
+// AllRangeSpans returns the list of all ranges for the specified list of spans.
+func AllRangeSpans(
ctx context.Context, ds *kvcoord.DistSender, spans []roachpb.Span,
) ([]roachpb.Span, error) {
diff --git a/pkg/ccl/changefeedccl/metrics.go b/pkg/ccl/changefeedccl/metrics.go
index 08f60004c34e..cd625b0f60fd 100644
--- a/pkg/ccl/changefeedccl/metrics.go
+++ b/pkg/ccl/changefeedccl/metrics.go
@@ -359,6 +359,24 @@ var (
Measurement: "Replans",
Unit: metric.Unit_COUNT,
}
+ metaChangefeedEventConsumerFlushNanos = metric.Metadata{
+ Name: "changefeed.nprocs_flush_nanos",
+ Help: "Total time spent idle waiting for the parallel consumer to flush",
+ Measurement: "Nanoseconds",
+ Unit: metric.Unit_NANOSECONDS,
+ }
+ metaChangefeedEventConsumerConsumeNanos = metric.Metadata{
+ Name: "changefeed.nprocs_consume_event_nanos",
+ Help: "Total time spent waiting to add an event to the parallel consumer",
+ Measurement: "Nanoseconds",
+ Unit: metric.Unit_NANOSECONDS,
+ }
+ metaChangefeedEventConsumerInFlightEvents = metric.Metadata{
+ Name: "changefeed.nprocs_in_flight_count",
+ Help: "Number of buffered events in the parallel consumer",
+ Measurement: "Count of Events",
+ Unit: metric.Unit_COUNT,
+ }
)
func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics {
@@ -548,16 +566,19 @@ func (a *AggMetrics) getOrCreateScope(scope string) (*sliMetrics, error) {
// Metrics are for production monitoring of changefeeds.
type Metrics struct {
- AggMetrics *AggMetrics
- KVFeedMetrics kvevent.Metrics
- SchemaFeedMetrics schemafeed.Metrics
- Failures *metric.Counter
- ResolvedMessages *metric.Counter
- QueueTimeNanos *metric.Counter
- CheckpointHistNanos *metric.Histogram
- FrontierUpdates *metric.Counter
- ThrottleMetrics cdcutils.Metrics
- ReplanCount *metric.Counter
+ AggMetrics *AggMetrics
+ KVFeedMetrics kvevent.Metrics
+ SchemaFeedMetrics schemafeed.Metrics
+ Failures *metric.Counter
+ ResolvedMessages *metric.Counter
+ QueueTimeNanos *metric.Counter
+ CheckpointHistNanos *metric.Histogram
+ FrontierUpdates *metric.Counter
+ ThrottleMetrics cdcutils.Metrics
+ ReplanCount *metric.Counter
+ ParallelConsumerFlushNanos *metric.Counter
+ ParallelConsumerConsumeNanos *metric.Counter
+ ParallelConsumerInFlightEvents *metric.Gauge
mu struct {
syncutil.Mutex
@@ -578,16 +599,19 @@ func (m *Metrics) getSLIMetrics(scope string) (*sliMetrics, error) {
// MakeMetrics makes the metrics for changefeed monitoring.
func MakeMetrics(histogramWindow time.Duration) metric.Struct {
m := &Metrics{
- AggMetrics: newAggregateMetrics(histogramWindow),
- KVFeedMetrics: kvevent.MakeMetrics(histogramWindow),
- SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow),
- ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages),
- Failures: metric.NewCounter(metaChangefeedFailures),
- QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
- CheckpointHistNanos: metric.NewHistogram(metaChangefeedCheckpointHistNanos, histogramWindow, metric.IOLatencyBuckets),
- FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
- ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
- ReplanCount: metric.NewCounter(metaChangefeedReplanCount),
+ AggMetrics: newAggregateMetrics(histogramWindow),
+ KVFeedMetrics: kvevent.MakeMetrics(histogramWindow),
+ SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow),
+ ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages),
+ Failures: metric.NewCounter(metaChangefeedFailures),
+ QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
+ CheckpointHistNanos: metric.NewHistogram(metaChangefeedCheckpointHistNanos, histogramWindow, metric.IOLatencyBuckets),
+ FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
+ ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
+ ReplanCount: metric.NewCounter(metaChangefeedReplanCount),
+ ParallelConsumerFlushNanos: metric.NewCounter(metaChangefeedEventConsumerFlushNanos),
+ ParallelConsumerConsumeNanos: metric.NewCounter(metaChangefeedEventConsumerConsumeNanos),
+ ParallelConsumerInFlightEvents: metric.NewGauge(metaChangefeedEventConsumerInFlightEvents),
}
m.mu.resolved = make(map[int]hlc.Timestamp)
diff --git a/pkg/ccl/changefeedccl/sink.go b/pkg/ccl/changefeedccl/sink.go
index 758daaa5eeab..6b9d367d8a91 100644
--- a/pkg/ccl/changefeedccl/sink.go
+++ b/pkg/ccl/changefeedccl/sink.go
@@ -24,6 +24,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
+ "github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
)
@@ -536,3 +537,47 @@ func (n *nullSink) Close() error {
func (n *nullSink) Dial() error {
return nil
}
+
+// safeSink wraps an EventSink in a mutex so that its methods are
+// thread-safe. It also has a beforeFlush hook which is called
+// at the beginning of safeSink.Flush().
+type safeSink struct {
+ syncutil.Mutex
+ beforeFlush func(ctx context.Context) error
+ wrapped EventSink
+}
+
+var _ EventSink = (*safeSink)(nil)
+
+func (s *safeSink) Dial() error {
+ s.Lock()
+ defer s.Unlock()
+ return s.wrapped.Dial()
+}
+
+func (s *safeSink) Close() error {
+ s.Lock()
+ defer s.Unlock()
+ return s.wrapped.Close()
+}
+
+func (s *safeSink) EmitRow(
+ ctx context.Context,
+ topic TopicDescriptor,
+ key, value []byte,
+ updated, mvcc hlc.Timestamp,
+ alloc kvevent.Alloc,
+) error {
+ s.Lock()
+ defer s.Unlock()
+ return s.wrapped.EmitRow(ctx, topic, key, value, updated, mvcc, alloc)
+}
+
+func (s *safeSink) Flush(ctx context.Context) error {
+ if err := s.beforeFlush(ctx); err != nil {
+ return err
+ }
+ s.Lock()
+ defer s.Unlock()
+ return s.wrapped.Flush(ctx)
+}
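
The shape of safeSink — a mutex serializing calls from many workers plus a beforeFlush hook that drains upstream buffers — is generic enough to sketch on its own. A minimal standalone analogue (hypothetical names, not the changefeed API):

package main

import (
	"context"
	"fmt"
	"sync"
)

// lockedEmitter serializes Emit and Flush across goroutines and runs a hook
// before flushing, so the wrapped functions never need to be thread-safe.
type lockedEmitter struct {
	mu          sync.Mutex
	beforeFlush func(context.Context) error
	emit        func(string) error
}

func (w *lockedEmitter) Emit(row string) error {
	w.mu.Lock()
	defer w.mu.Unlock()
	return w.emit(row)
}

func (w *lockedEmitter) Flush(ctx context.Context) error {
	// Drain upstream buffers first (the parallel consumer, in the changefeed
	// case), then flush under the lock.
	if err := w.beforeFlush(ctx); err != nil {
		return err
	}
	w.mu.Lock()
	defer w.mu.Unlock()
	return nil
}

func main() {
	w := &lockedEmitter{
		beforeFlush: func(context.Context) error { return nil },
		emit:        func(row string) error { fmt.Println(row); return nil },
	}
	_ = w.Emit("k1 -> v1")
	_ = w.Flush(context.Background())
}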
diff --git a/pkg/ccl/changefeedccl/sink_cloudstorage.go b/pkg/ccl/changefeedccl/sink_cloudstorage.go
index 60cbef569cb6..2176bcc13d21 100644
--- a/pkg/ccl/changefeedccl/sink_cloudstorage.go
+++ b/pkg/ccl/changefeedccl/sink_cloudstorage.go
@@ -10,7 +10,6 @@ package changefeedccl
import (
"bytes"
- "compress/gzip"
"context"
"crypto/rand"
"encoding/hex"
@@ -27,9 +26,11 @@ import (
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvevent"
"github.com/cockroachdb/cockroach/pkg/cloud"
"github.com/cockroachdb/cockroach/pkg/security/username"
+ "github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
+ "github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -289,7 +290,7 @@ type cloudStorageSink struct {
ext string
rowDelimiter []byte
- compression string
+ compression compressionAlgo
es cloud.ExternalStorage
@@ -307,9 +308,19 @@ type cloudStorageSink struct {
dataFilePartition string
prevFilename string
metrics metricsRecorder
+
+ asyncFlushActive bool
+ flushGroup ctxgroup.Group
+ asyncFlushCh chan flushRequest // channel for submitting flush requests.
+ asyncFlushTermCh chan struct{} // channel closed by async flusher to indicate an error
+ asyncFlushErr error // set by async flusher, prior to closing asyncFlushTermCh
}
-const sinkCompressionGzip = "gzip"
+type flushRequest struct {
+ file *cloudStorageSinkFile
+ dest string
+ flush chan struct{}
+}
var cloudStorageSinkIDAtomic int64
@@ -324,6 +335,16 @@ var partitionDateFormats = map[string]string{
}
var defaultPartitionFormat = partitionDateFormats["daily"]
+// flushQueueDepth limits how many flush requests may be outstanding
+// before we block.
+// In reality, we will block much sooner than this limit due
+// to blocking buffer memory limits (in its default configuration);
+// we just want this setting to be sufficiently large, but not
+// so large as to allow extremely large flush queues.
+// The default of 256, with the default file size of 16MB, gives us
+// a queue of 2.5GB of outstanding flush data.
+const flushQueueDepth = 256
+
func makeCloudStorageSink(
ctx context.Context,
u sinkURL,
@@ -367,9 +388,15 @@ func makeCloudStorageSink(
partitionFormat: defaultPartitionFormat,
timestampOracle: timestampOracle,
// TODO(dan,ajwerner): Use the jobs framework's session ID once that's available.
- jobSessionID: sessID,
- topicNamer: tn,
+ jobSessionID: sessID,
+ topicNamer: tn,
+ asyncFlushActive: enableAsyncFlush.Get(&settings.SV),
+ // TODO (yevgeniy): Consider adding ctx to Dial method instead.
+ flushGroup: ctxgroup.WithContext(ctx),
+ asyncFlushCh: make(chan flushRequest, flushQueueDepth),
+ asyncFlushTermCh: make(chan struct{}),
}
+ s.flushGroup.GoCtx(s.asyncFlusher)
if partitionFormat := u.consumeParam(changefeedbase.SinkParamPartitionFormat); partitionFormat != "" {
dateFormat, ok := partitionDateFormats[partitionFormat]
@@ -381,8 +408,7 @@ func makeCloudStorageSink(
}
if s.timestampOracle != nil {
- s.dataFileTs = cloudStorageFormatTime(s.timestampOracle.inclusiveLowerBoundTS())
- s.dataFilePartition = s.timestampOracle.inclusiveLowerBoundTS().GoTime().Format(s.partitionFormat)
+ s.setDataFileTimestamp()
}
switch encodingOpts.Format {
@@ -408,17 +434,17 @@ func makeCloudStorageSink(
changefeedbase.OptEnvelope, encodingOpts.Envelope)
}
- if !encodingOpts.KeyInValue {
+ if encodingOpts.Envelope != changefeedbase.OptEnvelopeBare && !encodingOpts.KeyInValue {
return nil, errors.Errorf(`this sink requires the WITH %s option`, changefeedbase.OptKeyInValue)
}
if codec := encodingOpts.Compression; codec != "" {
- if strings.EqualFold(codec, "gzip") {
- s.compression = sinkCompressionGzip
- s.ext = s.ext + ".gz"
- } else {
- return nil, errors.Errorf(`unsupported compression codec %q`, codec)
+ algo, ext, err := compressionFromString(codec)
+ if err != nil {
+ return nil, err
}
+ s.compression = algo
+ s.ext = s.ext + ext
}
// We make the external storage with a nil IOAccountingInterceptor since we
@@ -436,7 +462,7 @@ func makeCloudStorageSink(
func (s *cloudStorageSink) getOrCreateFile(
topic TopicDescriptor, eventMVCC hlc.Timestamp,
-) *cloudStorageSinkFile {
+) (*cloudStorageSinkFile, error) {
name, _ := s.topicNamer.Name(topic)
key := cloudStorageSinkKey{name, int64(topic.GetVersion())}
if item := s.files.Get(key); item != nil {
@@ -444,19 +470,23 @@ func (s *cloudStorageSink) getOrCreateFile(
if eventMVCC.Less(f.oldestMVCC) {
f.oldestMVCC = eventMVCC
}
- return f
+ return f, nil
}
f := &cloudStorageSinkFile{
created: timeutil.Now(),
cloudStorageSinkKey: key,
oldestMVCC: eventMVCC,
}
- switch s.compression {
- case sinkCompressionGzip:
- f.codec = gzip.NewWriter(&f.buf)
+
+ if s.compression.enabled() {
+ codec, err := newCompressionCodec(s.compression, &s.settings.SV, &f.buf)
+ if err != nil {
+ return nil, err
+ }
+ f.codec = codec
}
s.files.ReplaceOrInsert(f)
- return f
+ return f, nil
}
// EmitRow implements the Sink interface.
@@ -472,7 +502,10 @@ func (s *cloudStorageSink) EmitRow(
}
s.metrics.recordMessageSize(int64(len(key) + len(value)))
- file := s.getOrCreateFile(topic, mvcc)
+ file, err := s.getOrCreateFile(topic, mvcc)
+ if err != nil {
+ return err
+ }
file.alloc.Merge(&alloc)
if _, err := file.Write(value); err != nil {
@@ -506,6 +539,13 @@ func (s *cloudStorageSink) EmitResolvedTimestamp(
if err != nil {
return err
}
+
+ // Wait for previously issued async flush requests to complete
+ // before we write the resolved timestamp file.
+ if err := s.waitAsyncFlush(ctx); err != nil {
+ return errors.Wrapf(err, "while emitting resolved timestamp")
+ }
+
// Don't need to copy payload because we never buffer it anywhere.
part := resolved.GoTime().Format(s.partitionFormat)
@@ -569,35 +609,68 @@ func (s *cloudStorageSink) Flush(ctx context.Context) error {
return err
}
s.files.Clear(true /* addNodesToFreeList */)
+ s.setDataFileTimestamp()
+ return s.waitAsyncFlush(ctx)
+}
+func (s *cloudStorageSink) setDataFileTimestamp() {
// Record the least resolved timestamp being tracked in the frontier as of this point,
// to use for naming files until the next `Flush()`. See comment on cloudStorageSink
// for an overview of the naming convention and proof of correctness.
- s.dataFileTs = cloudStorageFormatTime(s.timestampOracle.inclusiveLowerBoundTS())
- s.dataFilePartition = s.timestampOracle.inclusiveLowerBoundTS().GoTime().Format(s.partitionFormat)
- return nil
+ ts := s.timestampOracle.inclusiveLowerBoundTS()
+ s.dataFileTs = cloudStorageFormatTime(ts)
+ s.dataFilePartition = ts.GoTime().Format(s.partitionFormat)
}
-// file should not be used after flushing.
-func (s *cloudStorageSink) flushFile(ctx context.Context, file *cloudStorageSinkFile) error {
- defer file.alloc.Release(ctx)
+// enableAsyncFlush controls async flushing behavior for this sink.
+var enableAsyncFlush = settings.RegisterBoolSetting(
+ settings.TenantWritable,
+ "changefeed.cloudstorage.async_flush.enabled",
+ "enable async flushing",
+ true,
+)
- if file.rawSize == 0 {
- // This method shouldn't be called with an empty file, but be defensive
- // about not writing empty files anyway.
+// waitAsyncFlush waits until all async flushes complete.
+func (s *cloudStorageSink) waitAsyncFlush(ctx context.Context) error {
+ done := make(chan struct{})
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-s.asyncFlushTermCh:
+ return s.asyncFlushErr
+ case s.asyncFlushCh <- flushRequest{flush: done}:
+ }
+
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-s.asyncFlushTermCh:
+ return s.asyncFlushErr
+ case <-done:
return nil
}
+}
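
waitAsyncFlush leans on channel FIFO ordering: a marker request enqueued now can only be handled after every file already in the queue, so closing its done channel proves the earlier flushes have completed. A freestanding sketch of that barrier idea (hypothetical names):

package main

import "fmt"

type job struct {
	work func()        // nil marks a barrier
	done chan struct{} // closed when the barrier is reached
}

func worker(jobs <-chan job) {
	for j := range jobs {
		if j.work == nil {
			close(j.done) // every job enqueued before this one has run
			continue
		}
		j.work()
	}
}

func main() {
	jobs := make(chan job, 8)
	go worker(jobs)
	jobs <- job{work: func() { fmt.Println("flush file 1") }}
	jobs <- job{work: func() { fmt.Println("flush file 2") }}
	barrier := job{done: make(chan struct{})}
	jobs <- barrier
	<-barrier.done // both files above have been flushed
}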
+
+var logQueueDepth = log.Every(30 * time.Second)
- if file.codec != nil {
- if err := file.codec.Close(); err != nil {
+// flushFile flushes the file to cloud storage.
+// The file should not be used after flushing.
+func (s *cloudStorageSink) flushFile(ctx context.Context, file *cloudStorageSinkFile) error {
+ asyncFlushEnabled := enableAsyncFlush.Get(&s.settings.SV)
+ if s.asyncFlushActive && !asyncFlushEnabled {
+ // Async flush behavior was turned off -- drain any active flush requests
+ // before flushing this file.
+ if err := s.waitAsyncFlush(ctx); err != nil {
return err
}
}
+ s.asyncFlushActive = asyncFlushEnabled
// We use this monotonically increasing fileID to ensure correct ordering
// among files emitted at the same timestamp during the same job session.
fileID := s.fileID
s.fileID++
+
// Pad file ID to maintain lexical ordering among files from the same sink.
// Note that we use `-` here to delimit the filename because we want
// `%d.RESOLVED` files to lexicographically succeed data files that have the
@@ -609,11 +682,94 @@ func (s *cloudStorageSink) flushFile(ctx context.Context, file *cloudStorageSink
"precedes a file emitted before: %s", filename, s.prevFilename)
}
s.prevFilename = filename
- compressedBytes := file.buf.Len()
- if err := cloud.WriteFile(ctx, s.es, filepath.Join(s.dataFilePartition, filename), bytes.NewReader(file.buf.Bytes())); err != nil {
+ dest := filepath.Join(s.dataFilePartition, filename)
+
+ if !asyncFlushEnabled {
+ return file.flushToStorage(ctx, s.es, dest, s.metrics)
+ }
+
+ // Try to submit the flush request without blocking; if the queue is
+ // full, log (rate limited) and fall back to a blocking submit.
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-s.asyncFlushTermCh:
+ return s.asyncFlushErr
+ case s.asyncFlushCh <- flushRequest{file: file, dest: dest}:
+ return nil
+ default:
+ if logQueueDepth.ShouldLog() {
+ log.Infof(ctx, "changefeed flush queue is full; ~%d bytes to flush",
+ flushQueueDepth*s.targetMaxFileSize)
+ }
+ }
+
+ // Queue was full, block until it's not.
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-s.asyncFlushTermCh:
+ return s.asyncFlushErr
+ case s.asyncFlushCh <- flushRequest{file: file, dest: dest}:
+ return nil
+ }
+}
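
The two-stage select above — a non-blocking attempt so a full queue can be reported, followed by a blocking send — is a common Go idiom for making back-pressure visible without spamming logs. A minimal sketch of the shape (hypothetical names; the real code also watches the flusher's termination channel):

package main

import (
	"context"
	"log"
)

// submit tries a non-blocking send first so a full queue can be reported,
// then falls back to a blocking send.
func submit(ctx context.Context, ch chan<- int, v int) error {
	select {
	case ch <- v:
		return nil
	default:
		log.Printf("queue full (%d pending); blocking until space frees up", len(ch))
	}
	select {
	case <-ctx.Done():
		return ctx.Err()
	case ch <- v:
		return nil
	}
}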
+
+func (s *cloudStorageSink) asyncFlusher(ctx context.Context) error {
+ defer close(s.asyncFlushTermCh)
+
+ for {
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case req, ok := <-s.asyncFlushCh:
+ if !ok {
+ return nil // we're done
+ }
+
+ // handle flush request.
+ if req.flush != nil {
+ close(req.flush)
+ continue
+ }
+
+ // flush file to storage.
+ flushDone := s.metrics.recordFlushRequestCallback()
+ err := req.file.flushToStorage(ctx, s.es, req.dest, s.metrics)
+ flushDone()
+
+ if err != nil {
+ log.Errorf(ctx, "error flushing file to storage: %s", err)
+ s.asyncFlushErr = err
+ return err
+ }
+ }
+ }
+}
+
+// flushToStorage writes the file out to 'dest' in external storage.
+func (f *cloudStorageSinkFile) flushToStorage(
+ ctx context.Context, es cloud.ExternalStorage, dest string, m metricsRecorder,
+) error {
+ defer f.alloc.Release(ctx)
+
+ if f.rawSize == 0 {
+ // This method shouldn't be called with an empty file, but be defensive
+ // about not writing empty files anyway.
+ return nil
+ }
+
+ if f.codec != nil {
+ if err := f.codec.Close(); err != nil {
+ return err
+ }
+ }
+
+ compressedBytes := f.buf.Len()
+ if err := cloud.WriteFile(ctx, es, dest, bytes.NewReader(f.buf.Bytes())); err != nil {
return err
}
- s.metrics.recordEmittedBatch(file.created, file.numMessages, file.oldestMVCC, file.rawSize, compressedBytes)
+ m.recordEmittedBatch(f.created, f.numMessages, f.oldestMVCC, f.rawSize, compressedBytes)
return nil
}
@@ -621,7 +777,10 @@ func (s *cloudStorageSink) flushFile(ctx context.Context, file *cloudStorageSink
// Close implements the Sink interface.
func (s *cloudStorageSink) Close() error {
s.files = nil
- return s.es.Close()
+ err := s.waitAsyncFlush(context.Background())
+ close(s.asyncFlushCh) // signal flusher to exit.
+ err = errors.CombineErrors(err, s.flushGroup.Wait())
+ return errors.CombineErrors(err, s.es.Close())
}
// Dial implements the Sink interface.
diff --git a/pkg/ccl/changefeedccl/sink_cloudstorage_test.go b/pkg/ccl/changefeedccl/sink_cloudstorage_test.go
index fba5f9bb4653..f545dc03853d 100644
--- a/pkg/ccl/changefeedccl/sink_cloudstorage_test.go
+++ b/pkg/ccl/changefeedccl/sink_cloudstorage_test.go
@@ -69,7 +69,7 @@ func TestCloudStorageSink(t *testing.T) {
defer log.Scope(t).Close(t)
ctx := context.Background()
- dir, dirCleanupFn := testutils.TempDir(t)
+ externalIODir, dirCleanupFn := testutils.TempDir(t)
defer dirCleanupFn()
gzipDecompress := func(t *testing.T, compressed []byte) []byte {
@@ -88,8 +88,12 @@ func TestCloudStorageSink(t *testing.T) {
return decompressed
}
- listLeafDirectories := func(root string) []string {
- absRoot := filepath.Join(dir, root)
+ testDir := func(t *testing.T) string {
+ return strings.ReplaceAll(t.Name(), "/", ";")
+ }
+
+ listLeafDirectories := func(t *testing.T) []string {
+ absRoot := filepath.Join(externalIODir, testDir(t))
var folders []string
@@ -126,7 +130,7 @@ func TestCloudStorageSink(t *testing.T) {
// slurpDir returns the contents of every file under root (relative to the
// temp dir created above), sorted by the name of the file.
- slurpDir := func(t *testing.T, root string) []string {
+ slurpDir := func(t *testing.T) []string {
var files []string
walkFn := func(path string, info os.FileInfo, err error) error {
if err != nil {
@@ -145,7 +149,7 @@ func TestCloudStorageSink(t *testing.T) {
files = append(files, string(file))
return nil
}
- absRoot := filepath.Join(dir, root)
+ absRoot := filepath.Join(externalIODir, testDir(t))
require.NoError(t, os.MkdirAll(absRoot, 0755))
require.NoError(t, filepath.Walk(absRoot, walkFn))
return files
@@ -154,7 +158,7 @@ func TestCloudStorageSink(t *testing.T) {
const unlimitedFileSize int64 = math.MaxInt64
var noKey []byte
settings := cluster.MakeTestingClusterSettings()
- settings.ExternalIODir = dir
+ settings.ExternalIODir = externalIODir
opts := changefeedbase.EncodingOptions{
Format: changefeedbase.OptFormatJSON,
Envelope: changefeedbase.OptEnvelopeWrapped,
@@ -162,7 +166,7 @@ func TestCloudStorageSink(t *testing.T) {
// NB: compression added in single-node subtest.
}
ts := func(i int64) hlc.Timestamp { return hlc.Timestamp{WallTime: i} }
- e, err := makeJSONEncoder(opts, changefeedbase.Targets{})
+ e, err := makeJSONEncoder(opts)
require.NoError(t, err)
clientFactory := blobs.TestBlobServiceClient(settings.ExternalIODir)
@@ -180,25 +184,35 @@ func TestCloudStorageSink(t *testing.T) {
user := username.RootUserName()
- sinkURI := func(dir string, maxFileSize int64) sinkURL {
- uri := `nodelocal://0/` + dir
+ sinkURI := func(t *testing.T, maxFileSize int64) sinkURL {
+ u, err := url.Parse(fmt.Sprintf("nodelocal://0/%s", testDir(t)))
+ require.NoError(t, err)
+ sink := sinkURL{URL: u}
if maxFileSize != unlimitedFileSize {
- uri += fmt.Sprintf("?%s=%d", changefeedbase.SinkParamFileSize, maxFileSize)
+ sink.addParam(changefeedbase.SinkParamFileSize, strconv.FormatInt(maxFileSize, 10))
}
- u, err := url.Parse(uri)
- require.NoError(t, err)
- return sinkURL{URL: u}
+ return sink
}
- t.Run(`golden`, func(t *testing.T) {
+ testWithAndWithoutAsyncFlushing := func(t *testing.T, name string, testFn func(*testing.T)) {
+ t.Helper()
+ testutils.RunTrueAndFalse(t, name+"/asyncFlush", func(t *testing.T, enable bool) {
+ old := enableAsyncFlush.Get(&settings.SV)
+ enableAsyncFlush.Override(context.Background(), &settings.SV, enable)
+ defer enableAsyncFlush.Override(context.Background(), &settings.SV, old)
+ testFn(t)
+ })
+ }
+
+ testWithAndWithoutAsyncFlushing(t, `golden`, func(t *testing.T) {
t1 := makeTopic(`t1`)
testSpan := roachpb.Span{Key: []byte("a"), EndKey: []byte("b")}
sf, err := span.MakeFrontier(testSpan)
require.NoError(t, err)
timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
- sinkDir := `golden`
+
s, err := makeCloudStorageSink(
- ctx, sinkURI(sinkDir, unlimitedFileSize), 1, settings,
+ ctx, sinkURI(t, unlimitedFileSize), 1, settings,
opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
@@ -210,11 +224,11 @@ func TestCloudStorageSink(t *testing.T) {
require.Equal(t, []string{
"v1\n",
- }, slurpDir(t, sinkDir))
+ }, slurpDir(t))
require.NoError(t, s.EmitResolvedTimestamp(ctx, e, ts(5)))
resolvedFile, err := os.ReadFile(filepath.Join(
- dir, sinkDir, `1970-01-01`, `197001010000000000000050000000000.RESOLVED`))
+ externalIODir, testDir(t), `1970-01-01`, `197001010000000000000050000000000.RESOLVED`))
require.NoError(t, err)
require.Equal(t, `{"resolved":"5.0000000000"}`, string(resolvedFile))
})
@@ -225,7 +239,14 @@ func TestCloudStorageSink(t *testing.T) {
return forwarded
}
- t.Run(`single-node`, func(t *testing.T) {
+ stringOrDefault := func(s, ifEmpty string) string {
+ if len(s) == 0 {
+ return ifEmpty
+ }
+ return s
+ }
+
+ testWithAndWithoutAsyncFlushing(t, `single-node`, func(t *testing.T) {
before := opts.Compression
// Compression codecs include buffering that interferes with other tests,
// e.g. the bucketing test that configures very small flush sizes.
@@ -234,7 +255,7 @@ func TestCloudStorageSink(t *testing.T) {
}()
for _, compression := range []string{"", "gzip"} {
opts.Compression = compression
- t.Run("compress="+compression, func(t *testing.T) {
+ t.Run("compress="+stringOrDefault(compression, "none"), func(t *testing.T) {
t1 := makeTopic(`t1`)
t2 := makeTopic(`t2`)
@@ -242,9 +263,8 @@ func TestCloudStorageSink(t *testing.T) {
sf, err := span.MakeFrontier(testSpan)
require.NoError(t, err)
timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
- dir := `single-node` + compression
s, err := makeCloudStorageSink(
- ctx, sinkURI(dir, unlimitedFileSize), 1, settings,
+ ctx, sinkURI(t, unlimitedFileSize), 1, settings,
opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
@@ -253,7 +273,7 @@ func TestCloudStorageSink(t *testing.T) {
// Empty flush emits no files.
require.NoError(t, s.Flush(ctx))
- require.Equal(t, []string(nil), slurpDir(t, dir))
+ require.Equal(t, []string(nil), slurpDir(t))
// Emitting rows and flushing should write them out in one file per table. Note
// the ordering among these two files is non deterministic as either of them could
@@ -268,19 +288,19 @@ func TestCloudStorageSink(t *testing.T) {
"v1\nv2\n",
"w1\n",
}
- actual := slurpDir(t, dir)
+ actual := slurpDir(t)
sort.Strings(actual)
require.Equal(t, expected, actual)
// Flushing with no new emits writes nothing new.
require.NoError(t, s.Flush(ctx))
- actual = slurpDir(t, dir)
+ actual = slurpDir(t)
sort.Strings(actual)
require.Equal(t, expected, actual)
// Without a flush, nothing new shows up.
require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(`v3`), ts(3), ts(3), zeroAlloc))
- actual = slurpDir(t, dir)
+ actual = slurpDir(t)
sort.Strings(actual)
require.Equal(t, expected, actual)
@@ -290,7 +310,7 @@ func TestCloudStorageSink(t *testing.T) {
require.NoError(t, s.Flush(ctx))
require.Equal(t, []string{
"v3\n",
- }, slurpDir(t, dir)[2:])
+ }, slurpDir(t)[2:])
// Data from different versions of a table is put in different files, so that we
// can guarantee that all rows in any given file have the same schema.
@@ -306,7 +326,7 @@ func TestCloudStorageSink(t *testing.T) {
"v4\n",
"v5\n",
}
- actual = slurpDir(t, dir)
+ actual = slurpDir(t)
actual = actual[len(actual)-2:]
sort.Strings(actual)
require.Equal(t, expected, actual)
@@ -314,22 +334,20 @@ func TestCloudStorageSink(t *testing.T) {
}
})
- t.Run(`multi-node`, func(t *testing.T) {
+ testWithAndWithoutAsyncFlushing(t, `multi-node`, func(t *testing.T) {
t1 := makeTopic(`t1`)
-
testSpan := roachpb.Span{Key: []byte("a"), EndKey: []byte("b")}
sf, err := span.MakeFrontier(testSpan)
require.NoError(t, err)
timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
- dir := `multi-node`
s1, err := makeCloudStorageSink(
- ctx, sinkURI(dir, unlimitedFileSize), 1,
+ ctx, sinkURI(t, unlimitedFileSize), 1,
settings, opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
defer func() { require.NoError(t, s1.Close()) }()
s2, err := makeCloudStorageSink(
- ctx, sinkURI(dir, unlimitedFileSize), 2,
+ ctx, sinkURI(t, unlimitedFileSize), 2,
settings, opts, timestampOracle, externalStorageFromURI, user, nil,
)
defer func() { require.NoError(t, s2.Close()) }()
@@ -353,19 +371,19 @@ func TestCloudStorageSink(t *testing.T) {
require.Equal(t, []string{
"v1\n",
"w1\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// If a node restarts then the entire distsql flow has to restart. If
// this happens before checkpointing, some data is written again but
// this is unavoidable.
s1R, err := makeCloudStorageSink(
- ctx, sinkURI(dir, unbuffered), 1,
+ ctx, sinkURI(t, unbuffered), 1,
settings, opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
defer func() { require.NoError(t, s1R.Close()) }()
s2R, err := makeCloudStorageSink(
- ctx, sinkURI(dir, unbuffered), 2,
+ ctx, sinkURI(t, unbuffered), 2,
settings, opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
@@ -390,7 +408,7 @@ func TestCloudStorageSink(t *testing.T) {
"v1\n",
"w1\n",
"w1\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
})
// The jobs system can't always clean up perfectly after itself and so there
@@ -400,15 +418,14 @@ func TestCloudStorageSink(t *testing.T) {
//
// This test is also sufficient for verifying the behavior of a multi-node
// changefeed using this sink. Ditto job restarts.
- t.Run(`zombie`, func(t *testing.T) {
+ testWithAndWithoutAsyncFlushing(t, `zombie`, func(t *testing.T) {
t1 := makeTopic(`t1`)
testSpan := roachpb.Span{Key: []byte("a"), EndKey: []byte("b")}
sf, err := span.MakeFrontier(testSpan)
require.NoError(t, err)
timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
- dir := `zombie`
s1, err := makeCloudStorageSink(
- ctx, sinkURI(dir, unlimitedFileSize), 1,
+ ctx, sinkURI(t, unlimitedFileSize), 1,
settings, opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
@@ -416,7 +433,7 @@ func TestCloudStorageSink(t *testing.T) {
s1.(*cloudStorageSink).sinkID = 7 // Force a deterministic sinkID.
s1.(*cloudStorageSink).jobSessionID = "a" // Force deterministic job session ID.
s2, err := makeCloudStorageSink(
- ctx, sinkURI(dir, unlimitedFileSize), 1,
+ ctx, sinkURI(t, unlimitedFileSize), 1,
settings, opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
@@ -441,19 +458,21 @@ func TestCloudStorageSink(t *testing.T) {
"v1\nv2\n",
"v3\n",
"v1\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
})
- t.Run(`bucketing`, func(t *testing.T) {
+ waitAsyncFlush := func(s Sink) error {
+ return s.(*cloudStorageSink).waitAsyncFlush(context.Background())
+ }
+ testWithAndWithoutAsyncFlushing(t, `bucketing`, func(t *testing.T) {
t1 := makeTopic(`t1`)
testSpan := roachpb.Span{Key: []byte("a"), EndKey: []byte("b")}
sf, err := span.MakeFrontier(testSpan)
require.NoError(t, err)
timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
- dir := `bucketing`
const targetMaxFileSize = 6
s, err := makeCloudStorageSink(
- ctx, sinkURI(dir, targetMaxFileSize), 1,
+ ctx, sinkURI(t, targetMaxFileSize), 1,
settings, opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
@@ -465,16 +484,17 @@ func TestCloudStorageSink(t *testing.T) {
for i := int64(1); i <= 5; i++ {
require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(fmt.Sprintf(`v%d`, i)), ts(i), ts(i), zeroAlloc))
}
+ require.NoError(t, waitAsyncFlush(s))
require.Equal(t, []string{
"v1\nv2\nv3\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// Flush then writes the rest.
require.NoError(t, s.Flush(ctx))
require.Equal(t, []string{
"v1\nv2\nv3\n",
"v4\nv5\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// Forward the SpanFrontier here and trigger an empty flush to update
// the sink's `inclusiveLowerBoundTs`
@@ -487,11 +507,12 @@ func TestCloudStorageSink(t *testing.T) {
for i := int64(6); i < 10; i++ {
require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(fmt.Sprintf(`v%d`, i)), ts(i), ts(i), zeroAlloc))
}
+ require.NoError(t, waitAsyncFlush(s))
require.Equal(t, []string{
"v1\nv2\nv3\n",
"v4\nv5\n",
"v6\nv7\nv8\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// Resolved timestamps are periodically written. This happens
// asynchronously from a different node and can be given an earlier
@@ -507,7 +528,7 @@ func TestCloudStorageSink(t *testing.T) {
"v4\nv5\n",
`{"resolved":"5.0000000000"}`,
"v6\nv7\nv8\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// Flush then writes the rest. Since we use the time of the EmitRow
// or EmitResolvedTimestamp calls to order files, the resolved timestamp
@@ -520,7 +541,7 @@ func TestCloudStorageSink(t *testing.T) {
`{"resolved":"5.0000000000"}`,
"v6\nv7\nv8\n",
"v9\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// A resolved timestamp emitted with ts > 5 should follow everything
// emitted thus far.
@@ -532,10 +553,10 @@ func TestCloudStorageSink(t *testing.T) {
"v6\nv7\nv8\n",
"v9\n",
`{"resolved":"6.0000000000"}`,
- }, slurpDir(t, dir))
+ }, slurpDir(t))
})
- t.Run(`partition-formatting`, func(t *testing.T) {
+ testWithAndWithoutAsyncFlushing(t, `partition-formatting`, func(t *testing.T) {
t1 := makeTopic(`t1`)
testSpan := roachpb.Span{Key: []byte("a"), EndKey: []byte("b")}
const targetMaxFileSize = 6
@@ -550,7 +571,7 @@ func TestCloudStorageSink(t *testing.T) {
time.Date(2000, time.January, 2, 6, 1, 1, 0, time.UTC),
}
- for i, tc := range []struct {
+ for _, tc := range []struct {
format string
expectedFolders []string
}{
@@ -582,51 +603,50 @@ func TestCloudStorageSink(t *testing.T) {
},
},
} {
- t.Run(tc.format, func(t *testing.T) {
- sf, err := span.MakeFrontier(testSpan)
- require.NoError(t, err)
- timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
+ testWithAndWithoutAsyncFlushing(t, stringOrDefault(tc.format, "default"),
+ func(t *testing.T) {
+ sf, err := span.MakeFrontier(testSpan)
+ require.NoError(t, err)
+ timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
- dir := fmt.Sprintf(`partition-formatting-%d`, i)
+ sinkURIWithParam := sinkURI(t, targetMaxFileSize)
+ sinkURIWithParam.addParam(changefeedbase.SinkParamPartitionFormat, tc.format)
+ t.Logf("format=%s sinkURIWithParam: %s", tc.format, sinkURIWithParam.String())
+ s, err := makeCloudStorageSink(
+ ctx, sinkURIWithParam, 1,
+ settings, opts, timestampOracle, externalStorageFromURI, user, nil,
+ )
- sinkURIWithParam := sinkURI(dir, targetMaxFileSize)
- sinkURIWithParam.addParam(changefeedbase.SinkParamPartitionFormat, tc.format)
- s, err := makeCloudStorageSink(
- ctx, sinkURIWithParam, 1,
- settings, opts, timestampOracle, externalStorageFromURI, user, nil,
- )
-
- require.NoError(t, err)
- defer func() { require.NoError(t, s.Close()) }()
- s.(*cloudStorageSink).sinkID = 7 // Force a deterministic sinkID.
+ require.NoError(t, err)
+ defer func() { require.NoError(t, s.Close()) }()
+ s.(*cloudStorageSink).sinkID = 7 // Force a deterministic sinkID.
- for i, timestamp := range timestamps {
- hlcTime := ts(timestamp.UnixNano())
+ for i, timestamp := range timestamps {
+ hlcTime := ts(timestamp.UnixNano())
- // Move the frontier and flush to update the dataFilePartition value
- _, err = sf.Forward(testSpan, hlcTime)
- require.NoError(t, err)
- require.NoError(t, s.Flush(ctx))
+ // Move the frontier and flush to update the dataFilePartition value
+ _, err = sf.Forward(testSpan, hlcTime)
+ require.NoError(t, err)
+ require.NoError(t, s.Flush(ctx))
- require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(fmt.Sprintf(`v%d`, i)), hlcTime, hlcTime, zeroAlloc))
- }
+ require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(fmt.Sprintf(`v%d`, i)), hlcTime, hlcTime, zeroAlloc))
+ }
- require.NoError(t, s.Flush(ctx)) // Flush the last file
- require.ElementsMatch(t, tc.expectedFolders, listLeafDirectories(dir))
- require.Equal(t, []string{"v0\n", "v1\n", "v2\n", "v3\n", "v4\n"}, slurpDir(t, dir))
- })
+ require.NoError(t, s.Flush(ctx)) // Flush the last file
+ require.ElementsMatch(t, tc.expectedFolders, listLeafDirectories(t))
+ require.Equal(t, []string{"v0\n", "v1\n", "v2\n", "v3\n", "v4\n"}, slurpDir(t))
+ })
}
})
- t.Run(`file-ordering`, func(t *testing.T) {
+ testWithAndWithoutAsyncFlushing(t, `file-ordering`, func(t *testing.T) {
t1 := makeTopic(`t1`)
testSpan := roachpb.Span{Key: []byte("a"), EndKey: []byte("b")}
sf, err := span.MakeFrontier(testSpan)
require.NoError(t, err)
timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
- dir := `file-ordering`
s, err := makeCloudStorageSink(
- ctx, sinkURI(dir, unlimitedFileSize), 1,
+ ctx, sinkURI(t, unlimitedFileSize), 1,
settings, opts, timestampOracle, externalStorageFromURI, user, nil,
)
require.NoError(t, err)
@@ -663,7 +683,7 @@ func TestCloudStorageSink(t *testing.T) {
`{"resolved":"3.0000000000"}`,
"e3next\n",
`{"resolved":"4.0000000000"}`,
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// Test that files with timestamp lower than the least resolved timestamp
// as of file creation time are ignored.
@@ -675,19 +695,18 @@ func TestCloudStorageSink(t *testing.T) {
`{"resolved":"3.0000000000"}`,
"e3next\n",
`{"resolved":"4.0000000000"}`,
- }, slurpDir(t, dir))
+ }, slurpDir(t))
})
- t.Run(`ordering-among-schema-versions`, func(t *testing.T) {
+ testWithAndWithoutAsyncFlushing(t, `ordering-among-schema-versions`, func(t *testing.T) {
t1 := makeTopic(`t1`)
testSpan := roachpb.Span{Key: []byte("a"), EndKey: []byte("b")}
sf, err := span.MakeFrontier(testSpan)
require.NoError(t, err)
timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf}
- dir := `ordering-among-schema-versions`
var targetMaxFileSize int64 = 10
s, err := makeCloudStorageSink(
- ctx, sinkURI(dir, targetMaxFileSize), 1, settings,
+ ctx, sinkURI(t, targetMaxFileSize), 1, settings,
opts, timestampOracle, externalStorageFromURI, user, nil)
require.NoError(t, err)
defer func() { require.NoError(t, s.Close()) }()
@@ -699,20 +718,22 @@ func TestCloudStorageSink(t *testing.T) {
// for the first file but not the second one.
t1.Version = 0
require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(`trigger-flush-v1`), ts(1), ts(1), zeroAlloc))
+ require.NoError(t, waitAsyncFlush(s))
require.Equal(t, []string{
"v1\ntrigger-flush-v1\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// Now make the file with the newer schema exceed its file size threshold and ensure
// that the file with the older schema is flushed (and ordered) before.
require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(`v2`), ts(1), ts(1), zeroAlloc))
t1.Version = 1
require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(`trigger-flush-v3`), ts(1), ts(1), zeroAlloc))
+ require.NoError(t, waitAsyncFlush(s))
require.Equal(t, []string{
"v1\ntrigger-flush-v1\n",
"v2\n",
"v3\ntrigger-flush-v3\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
// Calling `Flush()` on the sink should emit files in the order of their schema IDs.
require.NoError(t, s.EmitRow(ctx, t1, noKey, []byte(`w1`), ts(1), ts(1), zeroAlloc))
@@ -725,6 +746,6 @@ func TestCloudStorageSink(t *testing.T) {
"v3\ntrigger-flush-v3\n",
"x1\n",
"w1\n",
- }, slurpDir(t, dir))
+ }, slurpDir(t))
})
}
diff --git a/pkg/ccl/changefeedccl/sink_webhook.go b/pkg/ccl/changefeedccl/sink_webhook.go
index 06bb9a9a81f4..c8d7f723e7a5 100644
--- a/pkg/ccl/changefeedccl/sink_webhook.go
+++ b/pkg/ccl/changefeedccl/sink_webhook.go
@@ -295,7 +295,7 @@ func makeWebhookSink(
changefeedbase.OptEnvelope, encodingOpts.Envelope)
}
- if !encodingOpts.KeyInValue {
+ if encodingOpts.Envelope != changefeedbase.OptEnvelopeBare && !encodingOpts.KeyInValue {
return nil, errors.Errorf(`this sink requires the WITH %s option`, changefeedbase.OptKeyInValue)
}
diff --git a/pkg/ccl/changefeedccl/sink_webhook_test.go b/pkg/ccl/changefeedccl/sink_webhook_test.go
index 470abc49f973..85b8ce91ede7 100644
--- a/pkg/ccl/changefeedccl/sink_webhook_test.go
+++ b/pkg/ccl/changefeedccl/sink_webhook_test.go
@@ -122,7 +122,7 @@ func testSendAndReceiveRows(t *testing.T, sinkSrc Sink, sinkDest *cdctest.MockWe
opts, err := getGenericWebhookSinkOptions().GetEncodingOptions()
require.NoError(t, err)
- enc, err := makeJSONEncoder(opts, changefeedbase.Targets{})
+ enc, err := makeJSONEncoder(opts)
require.NoError(t, err)
// test a resolved timestamp entry
diff --git a/pkg/ccl/changefeedccl/testfeed_test.go b/pkg/ccl/changefeedccl/testfeed_test.go
index d6fbe9db9c4c..1d50d913c828 100644
--- a/pkg/ccl/changefeedccl/testfeed_test.go
+++ b/pkg/ccl/changefeedccl/testfeed_test.go
@@ -896,6 +896,16 @@ func (f *cloudFeedFactory) Feed(
if createStmt.SinkURI != nil {
return nil, errors.Errorf(`unexpected uri provided: "INTO %s"`, tree.AsString(createStmt.SinkURI))
}
+
+ if createStmt.Select != nil {
+ createStmt.Options = append(createStmt.Options,
+ // Normally, cloud storage requires key_in_value; with the bare envelope
+ // this option is not required. However, we need it to make this
+ // test feed work, so set it.
+ tree.KVOption{Key: changefeedbase.OptKeyInValue},
+ )
+ }
+
feedDir := strconv.Itoa(f.feedIdx)
f.feedIdx++
sinkURI := `experimental-nodelocal://0/` + feedDir
@@ -921,6 +931,7 @@ func (f *cloudFeedFactory) Feed(
ss: ss,
seenTrackerMap: make(map[string]struct{}),
dir: feedDir,
+ isBare: createStmt.Select != nil,
}
if err := f.startFeedJob(c.jobFeed, createStmt.String(), args...); err != nil {
return nil, err
@@ -941,8 +952,9 @@ type cloudFeedEntry struct {
type cloudFeed struct {
*jobFeed
seenTrackerMap
- ss *sinkSynchronizer
- dir string
+ ss *sinkSynchronizer
+ dir string
+ isBare bool
resolved string
rows []cloudFeedEntry
@@ -977,23 +989,55 @@ func reformatJSON(j interface{}) ([]byte, error) {
return buf.Bytes(), nil
}
-// extractKeyFromJSONValue extracts the `WITH key_in_value` key from a `WITH
-// format=json, envelope=wrapped` value.
-func extractKeyFromJSONValue(wrapped []byte) (key []byte, value []byte, _ error) {
+func extractFieldFromJSONValue(
+ fieldName string, isBare bool, wrapped []byte,
+) (field gojson.RawMessage, value []byte, err error) {
parsed := make(map[string]gojson.RawMessage)
+
if err := gojson.Unmarshal(wrapped, &parsed); err != nil {
return nil, nil, err
}
- keyParsed := parsed[`key`]
- delete(parsed, `key`)
- var err error
- if key, err = reformatJSON(keyParsed); err != nil {
- return nil, nil, err
+ if isBare {
+ meta := make(map[string]gojson.RawMessage)
+ if metaVal, haveMeta := parsed[jsonMetaSentinel]; haveMeta {
+ if err := gojson.Unmarshal(metaVal, &meta); err != nil {
+ return nil, nil, err
+ }
+ field = meta[fieldName]
+ delete(meta, fieldName)
+ if len(meta) == 0 {
+ delete(parsed, jsonMetaSentinel)
+ } else {
+ if metaVal, err = reformatJSON(meta); err != nil {
+ return nil, nil, err
+ }
+ parsed[jsonMetaSentinel] = metaVal
+ }
+ }
+ } else {
+ field = parsed[fieldName]
+ delete(parsed, fieldName)
}
+
if value, err = reformatJSON(parsed); err != nil {
return nil, nil, err
}
+ return field, value, nil
+}
+
+// extractKeyFromJSONValue extracts the `WITH key_in_value` key from a `WITH
+// format=json, envelope=wrapped` value.
+func extractKeyFromJSONValue(isBare bool, wrapped []byte) (key []byte, value []byte, err error) {
+ var keyParsed gojson.RawMessage
+ keyParsed, value, err = extractFieldFromJSONValue("key", isBare, wrapped)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ if key, err = reformatJSON(keyParsed); err != nil {
+ return nil, nil, err
+ }
return key, value, nil
}
@@ -1027,7 +1071,7 @@ func (c *cloudFeed) Next() (*cdctest.TestFeedMessage, error) {
//
// TODO(dan): Leave the key in the value if the TestFeed user
// specifically requested it.
- if m.Key, m.Value, err = extractKeyFromJSONValue(m.Value); err != nil {
+ if m.Key, m.Value, err = extractKeyFromJSONValue(c.isBare, m.Value); err != nil {
return nil, err
}
if isNew := c.markSeen(m); !isNew {
@@ -1534,7 +1578,12 @@ func (f *webhookFeedFactory) Feed(create string, args ...interface{}) (cdctest.T
// required value
createStmt.Options = append(createStmt.Options, tree.KVOption{Key: changefeedbase.OptTopicInValue})
-
+ if createStmt.Select != nil {
+ // Normally, webhook requires key_in_value; with the bare envelope
+ // this option is not required. However, we need it to make this
+ // test feed work, so set it.
+ createStmt.Options = append(createStmt.Options, tree.KVOption{Key: changefeedbase.OptKeyInValue})
+ }
var sinkDest *cdctest.MockWebhookSink
cert, _, err := cdctest.NewCACertBase64Encoded()
@@ -1580,6 +1629,7 @@ func (f *webhookFeedFactory) Feed(create string, args ...interface{}) (cdctest.T
jobFeed: newJobFeed(f.jobsTableConn(), wrapSink),
seenTrackerMap: make(map[string]struct{}),
ss: ss,
+ isBare: createStmt.Select != nil,
mockSink: sinkDest,
}
if err := f.startFeedJob(c.jobFeed, createStmt.String(), args...); err != nil {
@@ -1597,6 +1647,7 @@ type webhookFeed struct {
*jobFeed
seenTrackerMap
ss *sinkSynchronizer
+ isBare bool
mockSink *cdctest.MockWebhookSink
}
@@ -1619,17 +1670,15 @@ func isResolvedTimestamp(message []byte) (bool, error) {
// extractTopicFromJSONValue extracts the `WITH topic_in_value` topic from a `WITH
// format=json, envelope=wrapped` value.
-func extractTopicFromJSONValue(wrapped []byte) (topic string, value []byte, _ error) {
- parsedValue := make(map[string]gojson.RawMessage)
- if err := gojson.Unmarshal(wrapped, &parsedValue); err != nil {
- return "", nil, err
- }
- if err := gojson.Unmarshal(parsedValue[`topic`], &topic); err != nil {
+func extractTopicFromJSONValue(
+ isBare bool, wrapped []byte,
+) (topic string, value []byte, err error) {
+ var topicRaw gojson.RawMessage
+ topicRaw, value, err = extractFieldFromJSONValue("topic", isBare, wrapped)
+ if err != nil {
return "", nil, err
}
- delete(parsedValue, `topic`)
- var err error
- if value, err = reformatJSON(parsedValue); err != nil {
+ if err := gojson.Unmarshal(topicRaw, &topic); err != nil {
return "", nil, err
}
return topic, value, nil
@@ -1688,10 +1737,10 @@ func (f *webhookFeed) Next() (*cdctest.TestFeedMessage, error) {
if err != nil {
return nil, err
}
- if m.Key, m.Value, err = extractKeyFromJSONValue(wrappedValue); err != nil {
+ if m.Key, m.Value, err = extractKeyFromJSONValue(f.isBare, wrappedValue); err != nil {
return nil, err
}
- if m.Topic, m.Value, err = extractTopicFromJSONValue(m.Value); err != nil {
+ if m.Topic, m.Value, err = extractTopicFromJSONValue(f.isBare, m.Value); err != nil {
return nil, err
}
if isNew := f.markSeen(m); !isNew {
diff --git a/pkg/geo/bbox.go b/pkg/geo/bbox.go
index bc25ec25189d..b407bb80d37c 100644
--- a/pkg/geo/bbox.go
+++ b/pkg/geo/bbox.go
@@ -38,15 +38,21 @@ func NewCartesianBoundingBox() *CartesianBoundingBox {
// Repr is the string representation of the CartesianBoundingBox.
func (b *CartesianBoundingBox) Repr() string {
- // fmt.Sprintf with %f does not truncate leading zeroes, so use
- // FormatFloat instead.
- return fmt.Sprintf(
- "BOX(%s %s,%s %s)",
- strconv.FormatFloat(b.LoX, 'f', -1, 64),
- strconv.FormatFloat(b.LoY, 'f', -1, 64),
- strconv.FormatFloat(b.HiX, 'f', -1, 64),
- strconv.FormatFloat(b.HiY, 'f', -1, 64),
- )
+ return string(b.AppendFormat(nil))
+}
+
+// AppendFormat appends the string representation of the CartesianBoundingBox
+// to the buffer and returns the modified buffer.
+func (b *CartesianBoundingBox) AppendFormat(buf []byte) []byte {
+ buf = append(buf, "BOX("...)
+ buf = strconv.AppendFloat(buf, b.LoX, 'f', -1, 64)
+ buf = append(buf, ' ')
+ buf = strconv.AppendFloat(buf, b.LoY, 'f', -1, 64)
+ buf = append(buf, ',')
+ buf = strconv.AppendFloat(buf, b.HiX, 'f', -1, 64)
+ buf = append(buf, ' ')
+ buf = strconv.AppendFloat(buf, b.HiY, 'f', -1, 64)
+ return append(buf, ')')
}
// ParseCartesianBoundingBox parses a box2d string into a bounding box.
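The reuse pattern the new AppendFormat enables looks roughly like this (a sketch assuming CartesianBoundingBox embeds geopb.BoundingBox, which carries the LoX/LoY/HiX/HiY fields; not part of the patch):

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/geo"
	"github.com/cockroachdb/cockroach/pkg/geo/geopb"
)

func main() {
	boxes := []geo.CartesianBoundingBox{
		{BoundingBox: geopb.BoundingBox{LoX: 0, LoY: 0, HiX: 1, HiY: 1}},
		{BoundingBox: geopb.BoundingBox{LoX: -1.5, LoY: 2, HiX: 3, HiY: 4.25}},
	}
	buf := make([]byte, 0, 64)
	for i := range boxes {
		buf = boxes[i].AppendFormat(buf[:0]) // reuse the same allocation
		fmt.Println(string(buf))             // e.g. BOX(0 0,1 1)
	}
}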
diff --git a/pkg/geo/bbox_test.go b/pkg/geo/bbox_test.go
index 0ef94e9d7e6b..f099fb1e6bc7 100644
--- a/pkg/geo/bbox_test.go
+++ b/pkg/geo/bbox_test.go
@@ -14,6 +14,7 @@ import (
"fmt"
"math"
"strconv"
+ "strings"
"testing"
"github.com/cockroachdb/cockroach/pkg/geo/geopb"
@@ -63,6 +64,9 @@ func TestParseCartesianBoundingBox(t *testing.T) {
} else {
require.NoError(t, err)
require.Equal(t, tc.expected, ret)
+ // Test Repr/AppendFormat round trip.
+ require.Equal(t, strings.ToUpper(tc.s), ret.Repr())
+ require.Equal(t, strings.ToUpper(tc.s), string(ret.AppendFormat(nil)))
}
})
}
diff --git a/pkg/sql/sem/tree/BUILD.bazel b/pkg/sql/sem/tree/BUILD.bazel
index 67a57549fa6e..cdb40f077de8 100644
--- a/pkg/sql/sem/tree/BUILD.bazel
+++ b/pkg/sql/sem/tree/BUILD.bazel
@@ -182,6 +182,7 @@ go_test(
"function_name_test.go",
"indexed_vars_test.go",
"interval_test.go",
+ "json_test.go",
"main_test.go",
"name_part_test.go",
"name_resolution_test.go",
diff --git a/pkg/sql/sem/tree/datum.go b/pkg/sql/sem/tree/datum.go
index 676a0f125267..a6a48548a485 100644
--- a/pkg/sql/sem/tree/datum.go
+++ b/pkg/sql/sem/tree/datum.go
@@ -1782,7 +1782,11 @@ func (d *DUuid) Format(ctx *FmtCtx) {
if !bareStrings {
ctx.WriteByte('\'')
}
- ctx.WriteString(d.UUID.String())
+
+ buf := ctx.scratch[:uuid.RFC4122StrSize]
+ d.UUID.StringBytes(buf)
+ ctx.Write(buf)
+
if !bareStrings {
ctx.WriteByte('\'')
}
@@ -2344,7 +2348,7 @@ func (d *DTime) Format(ctx *FmtCtx) {
if !bareStrings {
ctx.WriteByte('\'')
}
- ctx.WriteString(timeofday.TimeOfDay(*d).String())
+ ctx.Write(timeofday.TimeOfDay(*d).AppendFormat(ctx.scratch[:0]))
if !bareStrings {
ctx.WriteByte('\'')
}
@@ -2520,7 +2524,7 @@ func (d *DTimeTZ) Format(ctx *FmtCtx) {
if !bareStrings {
ctx.WriteByte('\'')
}
- ctx.WriteString(d.TimeTZ.String())
+ ctx.Write(d.TimeTZ.AppendFormat(ctx.scratch[:0]))
if !bareStrings {
ctx.WriteByte('\'')
}
@@ -3603,7 +3607,7 @@ func (d *DBox2D) Format(ctx *FmtCtx) {
if !bareStrings {
ctx.WriteByte('\'')
}
- ctx.WriteString(d.CartesianBoundingBox.Repr())
+ ctx.Write(d.CartesianBoundingBox.AppendFormat(ctx.scratch[:0]))
if !bareStrings {
ctx.WriteByte('\'')
}
@@ -3731,10 +3735,10 @@ func AsJSON(
// without the T separator. This causes some compatibility problems
// with certain JSON consumers, so we'll use an alternate formatting
// path here to maintain consistency with PostgreSQL.
- return json.FromString(t.Time.In(loc).Format(time.RFC3339Nano)), nil
+ return json.FromString(formatTime(t.Time.In(loc), time.RFC3339Nano)), nil
case *DTimestamp:
// This is RFC3339Nano, but without the TZ fields.
- return json.FromString(t.UTC().Format("2006-01-02T15:04:05.999999999")), nil
+ return json.FromString(formatTime(t.UTC(), "2006-01-02T15:04:05.999999999")), nil
case *DDate, *DUuid, *DOid, *DInterval, *DBytes, *DIPAddr, *DTime, *DTimeTZ, *DBitArray, *DBox2D:
return json.FromString(AsStringWithFlags(t, FmtBareStrings, FmtDataConversionConfig(dcc))), nil
case *DGeometry:
@@ -3752,6 +3756,16 @@ func AsJSON(
}
}
+// formatTime formats the given time using the specified layout.
+// TODO(yuzefovich): consider using this function in more places.
+func formatTime(t time.Time, layout string) string {
+ // We only need the FmtCtx for its (pooled) buffers so that formatting
+ // incurs zero amortized allocations.
+ ctx := NewFmtCtx(FmtSimple)
+ ctx.Write(t.AppendFormat(ctx.scratch[:0], layout))
+ return ctx.CloseAndGetString()
+}
+
// ResolvedType implements the TypedExpr interface.
func (*DJSON) ResolvedType() *types.T {
return types.Jsonb
@@ -5060,14 +5074,13 @@ func (d *DOid) CompareError(ctx CompareContext, other Datum) (int, error) {
// Format implements the Datum interface.
func (d *DOid) Format(ctx *FmtCtx) {
- s := strconv.FormatUint(uint64(d.Oid), 10)
if d.semanticType.Oid() == oid.T_oid || d.name == "" {
- ctx.WriteString(s)
+ ctx.Write(strconv.AppendUint(ctx.scratch[:0], uint64(d.Oid), 10))
} else if ctx.HasFlags(fmtDisambiguateDatumTypes) {
ctx.WriteString("crdb_internal.create_")
ctx.WriteString(d.semanticType.SQLStandardName())
ctx.WriteByte('(')
- ctx.WriteString(s)
+ ctx.Write(strconv.AppendUint(ctx.scratch[:0], uint64(d.Oid), 10))
ctx.WriteByte(',')
lexbase.EncodeSQLStringWithFlags(&ctx.Buffer, d.name, lexbase.EncNoFlags)
ctx.WriteByte(')')
diff --git a/pkg/sql/sem/tree/format.go b/pkg/sql/sem/tree/format.go
index 192354f711e6..457c106b20ef 100644
--- a/pkg/sql/sem/tree/format.go
+++ b/pkg/sql/sem/tree/format.go
@@ -20,6 +20,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/sessiondatapb"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util"
+ "github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
@@ -273,6 +274,8 @@ type FmtCtx struct {
// indexedTypeFormatter is an optional interceptor for formatting
// IDTypeReferences differently than normal.
indexedTypeFormatter func(*FmtCtx, *OIDTypeReference)
+ // scratch is a small buffer used by Format implementations to avoid
+ // allocations.
+ scratch [64]byte
}
// FmtCtxOption is an option to pass into NewFmtCtx.
@@ -673,3 +676,11 @@ func (ctx *FmtCtx) formatNodeMaybeMarkRedaction(n NodeFormatter) {
n.Format(ctx)
}
}
+
+func init() {
+ // Sanity-check that the scratch buffer is large enough to hold a
+ // formatted UUID (see DUuid.Format).
+ ctx := NewFmtCtx(FmtSimple)
+ if len(ctx.scratch) < uuid.RFC4122StrSize {
+ panic(errors.AssertionFailedf("FmtCtx scratch must be at least %d bytes", uuid.RFC4122StrSize))
+ }
+ ctx.Close()
+}
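The scratch field is unexported, so the pattern it enables is shown here in isolation with a hypothetical fmtCtx type rather than the real FmtCtx: format into a small fixed array with append-style helpers, then write the bytes to the output buffer without building an intermediate string.

package main

import (
	"bytes"
	"fmt"
	"strconv"
)

type fmtCtx struct {
	bytes.Buffer
	scratch [64]byte // reused by formatting methods on this context
}

func (ctx *fmtCtx) formatOid(oid uint64) {
	// strconv.AppendUint writes into the scratch slice; no string is built.
	ctx.Write(strconv.AppendUint(ctx.scratch[:0], oid, 10))
}

func main() {
	var ctx fmtCtx
	ctx.formatOid(4294967295)
	fmt.Println(ctx.String()) // 4294967295
}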
diff --git a/pkg/sql/sem/tree/json_test.go b/pkg/sql/sem/tree/json_test.go
new file mode 100644
index 000000000000..89b4d9d472fd
--- /dev/null
+++ b/pkg/sql/sem/tree/json_test.go
@@ -0,0 +1,116 @@
+// Copyright 2022 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package tree_test
+
+import (
+ "math/rand"
+ "testing"
+ "time"
+
+ "github.com/cockroachdb/cockroach/pkg/sql/randgen"
+ "github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
+ "github.com/cockroachdb/cockroach/pkg/sql/sessiondatapb"
+ "github.com/cockroachdb/cockroach/pkg/sql/types"
+ "github.com/cockroachdb/cockroach/pkg/util/randutil"
+)
+
+func BenchmarkAsJSON(b *testing.B) {
+ // Use a fixed seed so that each invocation of this benchmark produces
+ // exactly the same stream of types and datums. The seed can be changed
+ // to an arbitrary value to produce a different stream.
+ rng := randutil.NewTestRandWithSeed(-4365865412074131521)
+
+ const numDatums = 1024
+ makeDatums := func(typ *types.T) tree.Datums {
+ const allowNulls = true
+ res := make(tree.Datums, numDatums)
+ for i := 0; i < numDatums; i++ {
+ res[i] = randgen.RandDatum(rng, typ, allowNulls)
+ }
+ return res
+ }
+
+ bench := func(b *testing.B, typ *types.T) {
+ b.ReportAllocs()
+ b.StopTimer()
+ datums := makeDatums(typ)
+ b.StartTimer()
+
+ for i := 0; i < b.N; i++ {
+ _, err := tree.AsJSON(datums[i%numDatums], sessiondatapb.DataConversionConfig{}, time.UTC)
+ if err != nil {
+ b.Fatal(err)
+ }
+ }
+ }
+
+ for _, typ := range testTypes(rng) {
+ b.Run(typ.String(), func(b *testing.B) {
+ bench(b, typ)
+ })
+
+ if randgen.IsAllowedForArray(typ) {
+ typ = types.MakeArray(typ)
+ b.Run(typ.String(), func(b *testing.B) {
+ bench(b, typ)
+ })
+ }
+ }
+}
+
+// testTypes returns the list of types to benchmark against.
+func testTypes(rng *rand.Rand) (typs []*types.T) {
+ for _, typ := range randgen.SeedTypes {
+ switch typ {
+ case types.AnyTuple:
+ // Ignore AnyTuple -- it's not very interesting; we'll generate test tuples below.
+ case types.RegClass, types.RegNamespace, types.RegProc, types.RegProcedure, types.RegRole, types.RegType:
+ // Ignore a bunch of pseudo-OID types (just want regular OID).
+ case types.Geometry, types.Geography:
+ // Ignore geometry/geography: these types are insanely inefficient;
+ // AsJson(Geo) -> MarshalGeo -> go JSON bytes -> ParseJSON -> Go native -> json.JSON
+ // Benchmarking this generates too much noise.
+ // TODO(yevgeniy): fix this.
+ default:
+ typs = append(typs, typ)
+ }
+ }
+
+ // Add tuple types.
+ var tupleTypes []*types.T
+ makeTupleType := func() *types.T {
+ contents := make([]*types.T, rng.Intn(6)) // Up to 5 fields.
+ for i := range contents {
+ contents[i] = randgen.RandTypeFromSlice(rng, typs)
+ }
+ candidateTuple := types.MakeTuple(contents)
+ // Ensure tuple type is unique.
+ for _, t := range tupleTypes {
+ if t.Equal(candidateTuple) {
+ return nil
+ }
+ }
+ tupleTypes = append(tupleTypes, candidateTuple)
+ return candidateTuple
+ }
+
+ const numTupleTypes = 5
+ for i := 0; i < numTupleTypes; i++ {
+ var typ *types.T
+ for typ == nil {
+ typ = makeTupleType()
+ }
+ typs = append(typs, typ)
+ }
+
+ return typs
+}
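With the standard Go tooling, the new benchmark can be run with, for example, `go test ./pkg/sql/sem/tree -bench BenchmarkAsJSON -benchmem` to compare allocation counts before and after this change.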
diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go
index 31e16b1fa60b..795e04d5a274 100644
--- a/pkg/ts/catalog/chart_catalog.go
+++ b/pkg/ts/catalog/chart_catalog.go
@@ -1447,6 +1447,24 @@ var charts = []sectionDescription{
"changefeed.replan_count",
},
},
+ {
+ Title: "Nprocs Consume Event Nanos",
+ Metrics: []string{
+ "changefeed.nprocs_consume_event_nanos",
+ },
+ },
+ {
+ Title: "Nprocs Flush Nanos",
+ Metrics: []string{
+ "changefeed.nprocs_flush_nanos",
+ },
+ },
+ {
+ Title: "Nprocs In Flight Count",
+ Metrics: []string{
+ "changefeed.nprocs_in_flight_count",
+ },
+ },
{
Title: "Flushed Bytes",
Metrics: []string{
diff --git a/pkg/util/json/BUILD.bazel b/pkg/util/json/BUILD.bazel
index 4074795a5336..3fd8879a5700 100644
--- a/pkg/util/json/BUILD.bazel
+++ b/pkg/util/json/BUILD.bazel
@@ -24,6 +24,7 @@ go_library(
"//pkg/sql/inverted",
"//pkg/sql/pgwire/pgcode",
"//pkg/sql/pgwire/pgerror",
+ "//pkg/util",
"//pkg/util/encoding",
"//pkg/util/syncutil",
"//pkg/util/unique",
diff --git a/pkg/util/json/json.go b/pkg/util/json/json.go
index cf9927385e43..b283fdf897d7 100644
--- a/pkg/util/json/json.go
+++ b/pkg/util/json/json.go
@@ -30,6 +30,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/inverted"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
+ "github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
uniq "github.com/cockroachdb/cockroach/pkg/util/unique"
"github.com/cockroachdb/errors"
@@ -390,6 +391,61 @@ func (b *ObjectBuilder) Build() JSON {
return jsonObject(sorter.pairs)
}
+// FixedKeysObjectBuilder is a JSON object builder that builds objects for a
+// fixed set of unique keys. The builder can be reused to build multiple JSON
+// objects.
+type FixedKeysObjectBuilder struct {
+ pairs []jsonKeyValuePair
+ keyOrd map[string]int
+ updated util.FastIntSet
+}
+
+// NewFixedKeysObjectBuilder creates a JSON object builder for the specified
+// set of fixed, unique keys.
+func NewFixedKeysObjectBuilder(keys []string) (*FixedKeysObjectBuilder, error) {
+ sort.Strings(keys)
+ pairs := make([]jsonKeyValuePair, len(keys))
+ keyOrd := make(map[string]int, len(keys))
+
+ for i, k := range keys {
+ if _, exists := keyOrd[k]; exists {
+ return nil, errors.AssertionFailedf("expected unique keys, found %v", keys)
+ }
+ pairs[i] = jsonKeyValuePair{k: jsonString(keys[i])}
+ keyOrd[k] = i
+ }
+
+ return &FixedKeysObjectBuilder{
+ pairs: pairs,
+ keyOrd: keyOrd,
+ }, nil
+}
+
+// Set sets the value for the specified key.
+// Every key passed to NewFixedKeysObjectBuilder must be set before Build is
+// called.
+func (b *FixedKeysObjectBuilder) Set(k string, v JSON) error {
+ ord, ok := b.keyOrd[k]
+ if !ok {
+ return errors.AssertionFailedf("unknown key %s", k)
+ }
+
+ b.pairs[ord].v = v
+ b.updated.Add(ord)
+ return nil
+}
+
+// Build builds the JSON object once all keys have been set, and resets the
+// builder so that it can be reused.
+func (b *FixedKeysObjectBuilder) Build() (JSON, error) {
+ if b.updated.Len() != len(b.pairs) {
+ return nil, errors.AssertionFailedf(
+ "expected all %d keys to be updated, %d updated",
+ len(b.pairs), b.updated.Len())
+ }
+ b.updated = util.FastIntSet{}
+ // Must copy b.pairs in case builder is reused.
+ return jsonObject(append([]jsonKeyValuePair(nil), b.pairs...)), nil
+}
+
// pairSorter sorts and uniqueifies JSON pairs. In order to keep
// the last one for pairs with the same key while sort.Sort is
// not stable, pairSorter uses []int orders to maintain order and
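Roughly, the builder is meant to be reused across rows like this (a sketch assuming the package's existing FromInt/FromString constructors; not part of the patch):

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/json"
)

func main() {
	b, err := json.NewFixedKeysObjectBuilder([]string{"id", "name"})
	if err != nil {
		panic(err)
	}
	for i := 0; i < 3; i++ {
		// Every key must be set on each iteration before Build.
		if err := b.Set("id", json.FromInt(i)); err != nil {
			panic(err)
		}
		if err := b.Set("name", json.FromString(fmt.Sprintf("row-%d", i))); err != nil {
			panic(err)
		}
		obj, err := b.Build()
		if err != nil {
			panic(err)
		}
		fmt.Println(obj) // e.g. {"id": 0, "name": "row-0"}
	}
}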
diff --git a/pkg/util/json/json_test.go b/pkg/util/json/json_test.go
index bf58df437f52..b2adeb3d5549 100644
--- a/pkg/util/json/json_test.go
+++ b/pkg/util/json/json_test.go
@@ -503,6 +503,17 @@ func TestNewObjectBuilderWithCounter(t *testing.T) {
}
func TestBuildJSONObject(t *testing.T) {
+ checkJSONObjectsEqual := func(t *testing.T, expected, found JSON) {
+ t.Helper()
+ c, err := found.Compare(expected)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if c != 0 {
+ t.Fatalf("expected %v to equal %v", found, expected)
+ }
+ }
+
testCases := []struct {
input []string
}{
@@ -525,17 +536,52 @@ func TestBuildJSONObject(t *testing.T) {
t.Fatal(err)
}
result := b.Build()
- c, err := result.Compare(expectedResult)
- if err != nil {
- t.Fatal(err)
- }
- if c != 0 {
- t.Fatalf("expected %v to equal %v", result, expectedResult)
- }
+ checkJSONObjectsEqual(t, expectedResult, result)
+
+ t.Run("fixedKeys", func(t *testing.T) {
+ uniqueKeys := func() (keys []string) {
+ for k := range m {
+ keys = append(keys, k)
+ }
+ return keys
+ }()
+
+ fkb, err := NewFixedKeysObjectBuilder(uniqueKeys)
+ if err != nil {
+ t.Fatal(err)
+ }
+ for i := 0; i < 5; i++ {
+ for k, v := range m {
+ if err := fkb.Set(k, v.(JSON)); err != nil {
+ t.Fatal(err)
+ }
+ }
+ result, err := fkb.Build()
+ if err != nil {
+ t.Fatal(err)
+ }
+ checkJSONObjectsEqual(t, expectedResult, result)
+ }
+ })
})
}
}
+func TestBuildFixedKeysJSONObjectErrors(t *testing.T) {
+ t.Run("require_unique_keys", func(t *testing.T) {
+ _, err := NewFixedKeysObjectBuilder([]string{"a", "b", "c", "a", "d"})
+ require.Error(t, err)
+ })
+ t.Run("requires_all_keys_set", func(t *testing.T) {
+ b, err := NewFixedKeysObjectBuilder([]string{"a", "b", "c"})
+ require.NoError(t, err)
+ require.NoError(t, b.Set("a", jsonNull{}))
+ require.NoError(t, b.Set("b", jsonNull{}))
+ _, err = b.Build()
+ require.Error(t, err)
+ })
+}
+
func jsonTestShorthand(s string) JSON {
j, err := ParseJSON(s)
if err != nil {
diff --git a/pkg/util/span/BUILD.bazel b/pkg/util/span/BUILD.bazel
index 1d616a8fe4e2..819fac7615a2 100644
--- a/pkg/util/span/BUILD.bazel
+++ b/pkg/util/span/BUILD.bazel
@@ -11,6 +11,7 @@ go_library(
"//pkg/roachpb",
"//pkg/util/hlc",
"//pkg/util/interval",
+ "//pkg/util/syncutil",
],
)
diff --git a/pkg/util/span/frontier.go b/pkg/util/span/frontier.go
index 7dc38531d2fa..7d2b7b2b9241 100644
--- a/pkg/util/span/frontier.go
+++ b/pkg/util/span/frontier.go
@@ -20,6 +20,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/interval"
+ "github.com/cockroachdb/cockroach/pkg/util/syncutil"
)
// frontierEntry represents a timestamped span. It is used as the nodes in both
@@ -91,6 +92,7 @@ func (h *frontierHeap) Pop() interface{} {
// Frontier tracks the minimum timestamp of a set of spans.
type Frontier struct {
+ syncutil.Mutex
// tree contains `*frontierEntry` items for the entire current tracked
// span set. Any tracked spans that have never been `Forward`ed will have a
// zero timestamp. If any entries needed to be split along a tracking
@@ -153,6 +155,12 @@ func (f *Frontier) AddSpansAt(startAt hlc.Timestamp, spans ...roachpb.Span) erro
// Frontier returns the minimum timestamp being tracked.
func (f *Frontier) Frontier() hlc.Timestamp {
+ f.Lock()
+ defer f.Unlock()
+ return f.frontierLocked()
+}
+
+func (f *Frontier) frontierLocked() hlc.Timestamp {
if f.minHeap.Len() == 0 {
return hlc.Timestamp{}
}
@@ -161,6 +169,8 @@ func (f *Frontier) Frontier() hlc.Timestamp {
// PeekFrontierSpan returns one of the spans at the Frontier.
func (f *Frontier) PeekFrontierSpan() roachpb.Span {
+ f.Lock()
+ defer f.Unlock()
if f.minHeap.Len() == 0 {
return roachpb.Span{}
}
@@ -176,11 +186,13 @@ func (f *Frontier) PeekFrontierSpan() roachpb.Span {
// set boundary). Similarly, an entry created by a previous Forward may be
// partially overlapped and have to be split into two entries.
func (f *Frontier) Forward(span roachpb.Span, ts hlc.Timestamp) (bool, error) {
- prevFrontier := f.Frontier()
+ f.Lock()
+ defer f.Unlock()
+ prevFrontier := f.frontierLocked()
if err := f.insert(span, ts); err != nil {
return false, err
}
- return prevFrontier.Less(f.Frontier()), nil
+ return prevFrontier.Less(f.frontierLocked()), nil
}
// extendRangeToTheLeft extends the range to the left of the range, provided those
@@ -390,6 +402,8 @@ type Operation func(roachpb.Span, hlc.Timestamp) (done OpResult)
// Entries invokes the given callback with the current timestamp for each
// component span in the tracked span set.
func (f *Frontier) Entries(fn Operation) {
+ f.Lock()
+ defer f.Unlock()
f.tree.Do(func(i interval.Interface) bool {
spe := i.(*frontierEntry)
return fn(spe.span, spe.ts).asBool()
@@ -415,6 +429,8 @@ func (f *Frontier) Entries(fn Operation) {
// Note: neither [a-b) nor [m, q) will be emitted since they fall outside the spans
// tracked by this frontier.
func (f *Frontier) SpanEntries(span roachpb.Span, op Operation) {
+ f.Lock()
+ defer f.Unlock()
todoRange := span.AsRange()
f.tree.DoMatching(func(i interval.Interface) bool {
@@ -440,6 +456,8 @@ func (f *Frontier) SpanEntries(span roachpb.Span, op Operation) {
// String implements Stringer.
func (f *Frontier) String() string {
+ f.Lock()
+ defer f.Unlock()
var buf strings.Builder
f.tree.Do(func(i interval.Interface) bool {
if buf.Len() != 0 {
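The locking is intended to make patterns like the following safe, where one goroutine forwards spans while another reads the frontier (a sketch assuming the package's existing MakeFrontier constructor; not part of the patch):

package main

import (
	"fmt"
	"sync"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/span"
)

func main() {
	sp := roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")}
	f, err := span.MakeFrontier(sp)
	if err != nil {
		panic(err)
	}

	var wg sync.WaitGroup
	wg.Add(2)
	go func() {
		defer wg.Done()
		for ts := int64(1); ts <= 100; ts++ {
			if _, err := f.Forward(sp, hlc.Timestamp{WallTime: ts}); err != nil {
				panic(err)
			}
		}
	}()
	go func() {
		defer wg.Done()
		for i := 0; i < 100; i++ {
			_ = f.Frontier() // safe to call concurrently with Forward now
		}
	}()
	wg.Wait()
	fmt.Println(f.Frontier()) // frontier at wall time 100
}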
diff --git a/pkg/util/strutil/BUILD.bazel b/pkg/util/strutil/BUILD.bazel
new file mode 100644
index 000000000000..3d6bf1ae2fe5
--- /dev/null
+++ b/pkg/util/strutil/BUILD.bazel
@@ -0,0 +1,19 @@
+load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data")
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+ name = "strutil",
+ srcs = ["util.go"],
+ importpath = "github.com/cockroachdb/cockroach/pkg/util/strutil",
+ visibility = ["//visibility:public"],
+)
+
+go_test(
+ name = "strutil_test",
+ srcs = ["util_test.go"],
+ args = ["-test.timeout=295s"],
+ embed = [":strutil"],
+ deps = ["@com_github_stretchr_testify//require"],
+)
+
+get_x_data(name = "get_x_data")
diff --git a/pkg/util/strutil/util.go b/pkg/util/strutil/util.go
new file mode 100644
index 000000000000..a1788a07b661
--- /dev/null
+++ b/pkg/util/strutil/util.go
@@ -0,0 +1,34 @@
+// Copyright 2022 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package strutil
+
+import "strconv"
+
+// AppendInt appends the decimal form of x to b and returns the result.
+// If the decimal form is shorter than width, it is padded with leading zeroes.
+// If it is longer than width, it is appended without truncation. For negative
+// x, the minus sign counts toward width.
+func AppendInt(b []byte, x int, width int) []byte {
+ if x < 0 {
+ width--
+ x = -x
+ b = append(b, '-')
+ }
+
+ var scratch [16]byte
+ xb := strconv.AppendInt(scratch[:0], int64(x), 10)
+
+ // Add 0-padding.
+ for w := len(xb); w < width; w++ {
+ b = append(b, '0')
+ }
+ return append(b, xb...)
+}
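For illustration, the padding behavior in a few cases (see also util_test.go below):

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/strutil"
)

func main() {
	fmt.Println(string(strutil.AppendInt(nil, 7, 3)))    // 007
	fmt.Println(string(strutil.AppendInt(nil, -4, 3)))   // -04
	fmt.Println(string(strutil.AppendInt(nil, 1234, 2))) // 1234 (no truncation)
}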
diff --git a/pkg/util/strutil/util_test.go b/pkg/util/strutil/util_test.go
new file mode 100644
index 000000000000..8a94ccfe94f6
--- /dev/null
+++ b/pkg/util/strutil/util_test.go
@@ -0,0 +1,34 @@
+// Copyright 2022 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+package strutil
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/stretchr/testify/require"
+)
+
+func TestAppendInt(t *testing.T) {
+ for _, tc := range []struct {
+ n int
+ width int
+ fmt string
+ }{
+ {0, 3, "%03d"},
+ {5, 2, "%02d"},
+ {10, 1, "%01d"},
+ {-11, 4, "%04d"},
+ {1234, 6, "%06d"},
+ {-321, 2, "%02d"},
+ } {
+ require.Equal(t, fmt.Sprintf(tc.fmt, tc.n), string(AppendInt(nil, tc.n, tc.width)))
+ }
+}
diff --git a/pkg/util/timeofday/BUILD.bazel b/pkg/util/timeofday/BUILD.bazel
index a9424abf9319..0c723b337c29 100644
--- a/pkg/util/timeofday/BUILD.bazel
+++ b/pkg/util/timeofday/BUILD.bazel
@@ -8,6 +8,7 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/util/duration",
+ "//pkg/util/strutil",
"//pkg/util/timeutil",
],
)
diff --git a/pkg/util/timeofday/time_of_day.go b/pkg/util/timeofday/time_of_day.go
index d3de2de87c96..04694e4de4fb 100644
--- a/pkg/util/timeofday/time_of_day.go
+++ b/pkg/util/timeofday/time_of_day.go
@@ -11,12 +11,12 @@
package timeofday
import (
- "fmt"
+ "bytes"
"math/rand"
- "strings"
"time"
"github.com/cockroachdb/cockroach/pkg/util/duration"
+ "github.com/cockroachdb/cockroach/pkg/util/strutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)
@@ -52,12 +52,23 @@ func New(hour, min, sec, micro int) TimeOfDay {
}
func (t TimeOfDay) String() string {
+ return string(t.AppendFormat(nil))
+}
+
+// AppendFormat appends the string representation of this TimeOfDay to the
+// buffer and returns the modified buffer.
+func (t TimeOfDay) AppendFormat(buf []byte) []byte {
+ buf = strutil.AppendInt(buf, t.Hour(), 2)
+ buf = append(buf, ':')
+ buf = strutil.AppendInt(buf, t.Minute(), 2)
+ buf = append(buf, ':')
+ buf = strutil.AppendInt(buf, t.Second(), 2)
micros := t.Microsecond()
if micros > 0 {
- s := fmt.Sprintf("%02d:%02d:%02d.%06d", t.Hour(), t.Minute(), t.Second(), micros)
- return strings.TrimRight(s, "0")
+ buf = append(buf, '.')
+ buf = strutil.AppendInt(buf, micros, 6)
+ buf = bytes.TrimRight(buf, "0")
}
- return fmt.Sprintf("%02d:%02d:%02d", t.Hour(), t.Minute(), t.Second())
+ return buf
}
// FromInt constructs a TimeOfDay from an int64, representing microseconds since
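A sketch of the new allocation-free path and its round trip with String (not part of the patch):

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/timeofday"
)

func main() {
	t := timeofday.New(7, 5, 3, 120000)
	buf := make([]byte, 0, 16)
	buf = t.AppendFormat(buf)
	fmt.Println(string(buf))               // 07:05:03.12
	fmt.Println(t.String() == string(buf)) // true
}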
diff --git a/pkg/util/timetz/BUILD.bazel b/pkg/util/timetz/BUILD.bazel
index 9d99cc9b2e8d..4482dbdf540d 100644
--- a/pkg/util/timetz/BUILD.bazel
+++ b/pkg/util/timetz/BUILD.bazel
@@ -9,6 +9,7 @@ go_library(
deps = [
"//pkg/sql/pgwire/pgcode",
"//pkg/sql/pgwire/pgerror",
+ "//pkg/util/strutil",
"//pkg/util/timeofday",
"//pkg/util/timeutil",
"//pkg/util/timeutil/pgdate",
diff --git a/pkg/util/timetz/timetz.go b/pkg/util/timetz/timetz.go
index 9969649fc190..71d62fcc48ba 100644
--- a/pkg/util/timetz/timetz.go
+++ b/pkg/util/timetz/timetz.go
@@ -11,12 +11,12 @@
package timetz
import (
- "fmt"
"regexp"
"time"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
+ "github.com/cockroachdb/cockroach/pkg/util/strutil"
"github.com/cockroachdb/cockroach/pkg/util/timeofday"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil/pgdate"
@@ -150,25 +150,33 @@ func ParseTimeTZ(
// String implements the Stringer interface.
func (t *TimeTZ) String() string {
+ return string(t.AppendFormat(nil))
+}
+
+// AppendFormat appends the string representation of the TimeTZ to the buffer
+// and returns the modified buffer.
+func (t *TimeTZ) AppendFormat(buf []byte) []byte {
tTime := t.ToTime()
- timeComponent := tTime.Format("15:04:05.999999")
- // 24:00:00 gets returned as 00:00:00, which is incorrect.
if t.TimeOfDay == timeofday.Time2400 {
- timeComponent = "24:00:00"
+ // 24:00:00 gets returned as 00:00:00, which is incorrect.
+ buf = append(buf, "24:00:00"...)
+ } else {
+ buf = tTime.AppendFormat(buf, "15:04:05.999999")
}
- timeZoneComponent := tTime.Format("Z07:00:00")
- // If it is UTC, .Format converts it to "Z".
- // Fully expand this component.
+
if t.OffsetSecs == 0 {
- timeZoneComponent = "+00:00:00"
- }
- // Go's time.Format functionality does not work for offsets which
- // in the range -0s < offsetSecs < -60s, e.g. -22s offset prints as 00:00:-22.
- // Manually correct for this.
- if 0 < t.OffsetSecs && t.OffsetSecs < 60 {
- timeZoneComponent = fmt.Sprintf("-00:00:%02d", t.OffsetSecs)
+ // If it is UTC, .Format converts it to "Z".
+ // Fully expand this component.
+ buf = append(buf, "+00:00:00"...)
+ } else if 0 < t.OffsetSecs && t.OffsetSecs < 60 {
+ // Go's time.Format does not work for offsets between 0s and -60s
+ // (exclusive), e.g. a -22s offset prints as 00:00:-22.
+ // Manually correct for this.
+ buf = append(buf, "-00:00:"...)
+ buf = strutil.AppendInt(buf, int(t.OffsetSecs), 2)
+ } else {
+ buf = tTime.AppendFormat(buf, "Z07:00:00")
}
- return timeComponent + timeZoneComponent
+ return buf
}
// ToTime converts a DTimeTZ to a time.Time, corrected to the given location.
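A sketch of the TimeTZ round trip, using a struct literal since the TimeOfDay and OffsetSecs fields are exported (not part of the patch):

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/timeofday"
	"github.com/cockroachdb/cockroach/pkg/util/timetz"
)

func main() {
	t := timetz.TimeTZ{TimeOfDay: timeofday.New(10, 30, 0, 0), OffsetSecs: 0}
	// Zero offset is fully expanded rather than rendered as "Z".
	fmt.Println(string(t.AppendFormat(nil))) // 10:30:00+00:00:00
}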
diff --git a/pkg/util/uint128/uint128.go b/pkg/util/uint128/uint128.go
index e54e24cac611..86fff00e7b62 100644
--- a/pkg/util/uint128/uint128.go
+++ b/pkg/util/uint128/uint128.go
@@ -30,6 +30,13 @@ func (u Uint128) GetBytes() []byte {
return buf
}
+// AppendBytes appends the big-endian byte representation of the Uint128 to
+// the buffer and returns the modified buffer.
+func (u Uint128) AppendBytes(buf []byte) []byte {
+ buf = binary.BigEndian.AppendUint64(buf, u.Hi)
+ return binary.BigEndian.AppendUint64(buf, u.Lo)
+}
+
// String returns a hexadecimal string representation.
func (u Uint128) String() string {
return hex.EncodeToString(u.GetBytes())
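AppendBytes produces the same big-endian encoding as GetBytes, just into a caller-supplied buffer, as this sketch illustrates (not part of the patch):

package main

import (
	"bytes"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/uint128"
)

func main() {
	u := uint128.Uint128{Hi: 0x0102030405060708, Lo: 0x090a0b0c0d0e0f10}
	buf := u.AppendBytes(make([]byte, 0, 16))
	fmt.Println(bytes.Equal(buf, u.GetBytes())) // true
}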
diff --git a/pkg/util/uuid/uuid.go b/pkg/util/uuid/uuid.go
index f161a6b72b87..fd0951511ac4 100644
--- a/pkg/util/uuid/uuid.go
+++ b/pkg/util/uuid/uuid.go
@@ -30,6 +30,9 @@ import (
// Size of a UUID in bytes.
const Size = 16
+// RFC4122StrSize is the size of the RFC-4122 string representation of a UUID.
+const RFC4122StrSize = 36
+
// Bytes represents a byte slice which is intended to be interpreted as a binary
// encoding of a UUID.
type Bytes []byte
@@ -144,7 +147,7 @@ func (u *UUID) bytesMut() []byte {
// String returns a canonical RFC-4122 string representation of the UUID:
// xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx.
func (u UUID) String() string {
- buf := make([]byte, 36)
+ buf := make([]byte, RFC4122StrSize)
u.StringBytes(buf)
return string(buf)
}
@@ -152,7 +155,7 @@ func (u UUID) String() string {
// StringBytes writes the result of String directly into a buffer, which must
// have a length of at least 36.
func (u UUID) StringBytes(buf []byte) {
- _ = buf[:36]
+ _ = buf[:RFC4122StrSize]
hex.Encode(buf[0:8], u[0:4])
buf[8] = '-'
hex.Encode(buf[9:13], u[4:6])
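Finally, a sketch of formatting a UUID into a fixed-size buffer via the new constant, assuming the package's existing MakeV4 constructor (for illustration only):

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

func main() {
	u := uuid.MakeV4()
	var buf [uuid.RFC4122StrSize]byte
	u.StringBytes(buf[:])
	fmt.Println(string(buf[:]) == u.String()) // true
}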