From 343335ada6d05016b26dc20c2aa1bcebe71d351d Mon Sep 17 00:00:00 2001 From: Rafi Shamim Date: Wed, 15 Jun 2022 15:40:57 -0400 Subject: [PATCH 1/2] builtins: add pg_backend_pid() Release note (sql change): Updated the pg_backend_pid() builtin function so it matches with the data in the query cancellation key created during session initialization. The function is just for compatibility, and it does not return a real process ID. --- docs/generated/sql/functions.md | 2 ++ pkg/sql/conn_executor.go | 1 + .../pgwire/pgwirecancel/backend_key_data.go | 7 ++++ pkg/sql/sem/builtins/pg_builtins.go | 10 ++++-- pkg/sql/sem/eval/BUILD.bazel | 1 + pkg/sql/sem/eval/context.go | 5 +++ pkg/testutils/pgtest/pgtest.go | 34 +++++++++++++++++++ 7 files changed, 57 insertions(+), 3 deletions(-) diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md index 53728b7ed912..7e68f34673fa 100644 --- a/docs/generated/sql/functions.md +++ b/docs/generated/sql/functions.md @@ -3419,6 +3419,8 @@ A write probe will effectively probe reads as well.

oid(int: int) → oid

Converts an integer to an OID.

+pg_backend_pid() → int

Returns a numerical ID attached to this session. This ID is part of the query cancellation key used by the wire protocol. This function was only added for compatibility, and unlike in Postgres, thereturned value does not correspond to a real process ID.

+
pg_collation_for(str: anyelement) → string

Returns the collation of the argument

pg_column_is_updatable(reloid: oid, attnum: int2, include_triggers: bool) → bool

Returns whether the given column can be updated.

diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index a5c13e97649e..637289efc691 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -2674,6 +2674,7 @@ func (ex *connExecutor) initEvalCtx(ctx context.Context, evalCtx *extendedEvalCo RangeProber: p.execCfg.RangeProber, StmtDiagnosticsRequestInserter: ex.server.cfg.StmtDiagnosticsRecorder.InsertRequest, CatalogBuiltins: &p.evalCatalogBuiltins, + QueryCancelKey: ex.queryCancelKey, }, Tracing: &ex.sessionTracing, MemMetrics: &ex.memMetrics, diff --git a/pkg/sql/pgwire/pgwirecancel/backend_key_data.go b/pkg/sql/pgwire/pgwirecancel/backend_key_data.go index 82b3ac4272ee..1b151d9bc2dc 100644 --- a/pkg/sql/pgwire/pgwirecancel/backend_key_data.go +++ b/pkg/sql/pgwire/pgwirecancel/backend_key_data.go @@ -70,5 +70,12 @@ func (b BackendKeyData) GetSQLInstanceID() base.SQLInstanceID { bits = bits &^ leadingBitMask // Use the upper 32 bits as the sqlInstanceID. return base.SQLInstanceID(bits >> 32) +} + +// GetPGBackendPID returns the upper 32 bits of this BackendKeyData. In Postgres, +// this is the process ID, but we expose it only for compatibility. +func (b BackendKeyData) GetPGBackendPID() uint32 { + bits := uint64(b) + return uint32(bits >> 32) } diff --git a/pkg/sql/sem/builtins/pg_builtins.go b/pkg/sql/sem/builtins/pg_builtins.go index e6c6e90813c1..5aa8c26696c0 100644 --- a/pkg/sql/sem/builtins/pg_builtins.go +++ b/pkg/sql/sem/builtins/pg_builtins.go @@ -540,10 +540,14 @@ var pgBuiltins = map[string]builtinDefinition{ tree.Overload{ Types: tree.ArgTypes{}, ReturnType: tree.FixedReturnType(types.Int), - Fn: func(_ *eval.Context, _ tree.Datums) (tree.Datum, error) { - return tree.NewDInt(-1), nil + Fn: func(ctx *eval.Context, _ tree.Datums) (tree.Datum, error) { + pid := ctx.QueryCancelKey.GetPGBackendPID() + return tree.NewDInt(tree.DInt(pid)), nil }, - Info: notUsableInfo, + Info: "Returns a numerical ID attached to this session. This ID is " + + "part of the query cancellation key used by the wire protocol. This " + + "function was only added for compatibility, and unlike in Postgres, the" + + "returned value does not correspond to a real process ID.", Volatility: volatility.Stable, }, ), diff --git a/pkg/sql/sem/eval/BUILD.bazel b/pkg/sql/sem/eval/BUILD.bazel index 7b20225264f0..adca2fb4ca0f 100644 --- a/pkg/sql/sem/eval/BUILD.bazel +++ b/pkg/sql/sem/eval/BUILD.bazel @@ -50,6 +50,7 @@ go_library( "//pkg/sql/pgwire/pgcode", "//pkg/sql/pgwire/pgerror", "//pkg/sql/pgwire/pgnotice", + "//pkg/sql/pgwire/pgwirecancel", "//pkg/sql/privilege", "//pkg/sql/roleoption", "//pkg/sql/sem/cast", diff --git a/pkg/sql/sem/eval/context.go b/pkg/sql/sem/eval/context.go index 26eff83ebd06..da12a3c3df18 100644 --- a/pkg/sql/sem/eval/context.go +++ b/pkg/sql/sem/eval/context.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgwirecancel" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" "github.com/cockroachdb/cockroach/pkg/sql/sessiondatapb" @@ -211,6 +212,10 @@ type Context struct { // CatalogBuiltins is used by various builtins which depend on looking up // catalog information. Unlike the Planner, it is available in DistSQL. CatalogBuiltins CatalogBuiltins + + // QueryCancelKey is the key used by the pgwire protocol to cancel the + // query currently running in this session. + QueryCancelKey pgwirecancel.BackendKeyData } var _ tree.ParseTimeContext = &Context{} diff --git a/pkg/testutils/pgtest/pgtest.go b/pkg/testutils/pgtest/pgtest.go index ddc414dc1294..b22a41b00a9b 100644 --- a/pkg/testutils/pgtest/pgtest.go +++ b/pkg/testutils/pgtest/pgtest.go @@ -18,6 +18,7 @@ import ( "fmt" "net" "reflect" + "strconv" "strings" "testing" @@ -78,11 +79,44 @@ func NewPGTest(ctx context.Context, addr, user string) (*PGTest, error) { if backendKeyData == nil { return nil, errors.Errorf("did not receive BackendKeyData") } + if err := checkPGBackendPID(p, backendKeyData); err != nil { + return nil, err + } + p.isCockroachDB = foundCrdb success = err == nil return p, err } +func checkPGBackendPID(p *PGTest, backendKeyData *pgproto3.BackendKeyData) error { + if err := p.fe.Send(&pgproto3.Query{ + String: "SELECT pg_backend_pid();", + }); err != nil { + return errors.Wrap(err, "fetching pg_backend_pid") + } + msgs, err := p.Until(false /* keepErrMsg */, &pgproto3.ReadyForQuery{}) + if err != nil { + return errors.Wrap(err, "fetching pg_backend_pid") + } + matched := false + for _, msg := range msgs { + if d, ok := msg.(*pgproto3.DataRow); ok { + pid, err := strconv.Atoi(string(d.Values[0])) + if err != nil { + return errors.Wrap(err, "parsing pg_backend_pid") + } + if uint32(pid) != backendKeyData.ProcessID { + return errors.Errorf("wrong pg_backend_pid; wanted %d, got %d", backendKeyData.ProcessID, pid) + } + matched = true + } + } + if !matched { + return errors.Errorf("could not retrieve pg_backend_pid") + } + return nil +} + // Close sends a Terminate message and closes the connection. func (p *PGTest) Close() error { defer p.conn.Close() From 04be3bca9e201a2503c298c4d96494f483150abc Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 15 Jun 2022 09:12:49 +0000 Subject: [PATCH 2/2] batcheval: allow writing tombstones in `AddSSTable` This patch allows writing MVCC point tombstones in `AddSSTable`, which will be needed by cluster-to-cluster replication. Previously, SSTs containing tombstones could error or be silently accepted. The version gate `AddSSTableTombstones` has been added to allow clients to rely on this behavior. Note that `AddSSTable` has never made any attempt at correctly calculating `MVCCStats.GCBytesAge` when shadowing existing data, and this patch does not attempt to either. This will be addressed later. Release note: None --- .../settings/settings-for-tenants.txt | 2 +- docs/generated/settings/settings.html | 2 +- pkg/clusterversion/cockroach_versions.go | 7 + pkg/clusterversion/key_string.go | 5 +- pkg/kv/kvserver/batcheval/cmd_add_sstable.go | 19 +-- .../batcheval/cmd_add_sstable_test.go | 143 ++++++++++-------- pkg/roachpb/api.proto | 9 +- pkg/storage/sst.go | 17 ++- 8 files changed, 113 insertions(+), 91 deletions(-) diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index 2c239f57fa8c..dad2d8c8f11a 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -281,4 +281,4 @@ trace.jaeger.agent string the address of a Jaeger agent to receive traces using trace.opentelemetry.collector string address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as :. If no port is specified, 4317 will be used. trace.span_registry.enabled boolean true if set, ongoing traces can be seen at https:///#/debug/tracez trace.zipkin.collector string the address of a Zipkin instance to receive traces, as :. If no port is specified, 9411 will be used. -version version 22.1-18 set the active cluster version in the format '.' +version version 22.1-20 set the active cluster version in the format '.' diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 1a8892748f39..bd41c08e3c9b 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -212,6 +212,6 @@ trace.opentelemetry.collectorstringaddress of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as :. If no port is specified, 4317 will be used. trace.span_registry.enabledbooleantrueif set, ongoing traces can be seen at https:///#/debug/tracez trace.zipkin.collectorstringthe address of a Zipkin instance to receive traces, as :. If no port is specified, 9411 will be used. -versionversion22.1-18set the active cluster version in the format '.' +versionversion22.1-20set the active cluster version in the format '.' diff --git a/pkg/clusterversion/cockroach_versions.go b/pkg/clusterversion/cockroach_versions.go index 4fc346f197c2..d414b2571e33 100644 --- a/pkg/clusterversion/cockroach_versions.go +++ b/pkg/clusterversion/cockroach_versions.go @@ -377,6 +377,9 @@ const ( // probabilistically collects stmt bundles, controlled by the user provided // sampling rate. SampledStmtDiagReqs + // AddSSTableTombstones allows writing MVCC point tombstones via AddSSTable. + // Previously, SSTs containing these could error. + AddSSTableTombstones // ************************************************* // Step (1): Add new versions here. @@ -666,6 +669,10 @@ var versionsSingleton = keyedVersions{ Key: SampledStmtDiagReqs, Version: roachpb.Version{Major: 22, Minor: 1, Internal: 18}, }, + { + Key: AddSSTableTombstones, + Version: roachpb.Version{Major: 22, Minor: 1, Internal: 20}, + }, // ************************************************* // Step (2): Add new versions here. diff --git a/pkg/clusterversion/key_string.go b/pkg/clusterversion/key_string.go index f583164116bb..29f2e5f723b6 100644 --- a/pkg/clusterversion/key_string.go +++ b/pkg/clusterversion/key_string.go @@ -71,11 +71,12 @@ func _() { _ = x[MVCCRangeTombstones-60] _ = x[UpgradeSequenceToBeReferencedByID-61] _ = x[SampledStmtDiagReqs-62] + _ = x[AddSSTableTombstones-63] } -const _Key_name = "V21_2Start22_1TargetBytesAvoidExcessAvoidDrainingNamesDrainingNamesMigrationTraceIDDoesntImplyStructuredRecordingAlterSystemTableStatisticsAddAvgSizeColMVCCAddSSTableInsertPublicSchemaNamespaceEntryOnRestoreUnsplitRangesInAsyncGCJobsValidateGrantOptionPebbleFormatBlockPropertyCollectorProbeRequestSelectRPCsTakeTracingInfoInbandPreSeedTenantSpanConfigsSeedTenantSpanConfigsPublicSchemasWithDescriptorsEnsureSpanConfigReconciliationEnsureSpanConfigSubscriptionEnableSpanConfigStoreScanWholeRowsSCRAMAuthenticationUnsafeLossOfQuorumRecoveryRangeLogAlterSystemProtectedTimestampAddColumnEnableProtectedTimestampsForTenantDeleteCommentsWithDroppedIndexesRemoveIncompatibleDatabasePrivilegesAddRaftAppliedIndexTermMigrationPostAddRaftAppliedIndexTermMigrationDontProposeWriteTimestampForLeaseTransfersTenantSettingsTableEnablePebbleFormatVersionBlockPropertiesDisableSystemConfigGossipTriggerMVCCIndexBackfillerEnableLeaseHolderRemovalBackupResolutionInJobLooselyCoupledRaftLogTruncationChangefeedIdlenessBackupDoesNotOverwriteLatestAndCheckpointEnableDeclarativeSchemaChangerRowLevelTTLPebbleFormatSplitUserKeysMarkedIncrementalBackupSubdirDateStyleIntervalStyleCastRewriteEnableNewStoreRebalancerClusterLocksVirtualTableAutoStatsTableSettingsForecastStatsSuperRegionsEnableNewChangefeedOptionsSpanCountTablePreSeedSpanCountTableSeedSpanCountTableV22_1Start22_2LocalTimestampsEnsurePebbleFormatVersionRangeKeysEnablePebbleFormatVersionRangeKeysTrigramInvertedIndexesRemoveGrantPrivilegeMVCCRangeTombstonesUpgradeSequenceToBeReferencedByIDSampledStmtDiagReqs" +const _Key_name = "V21_2Start22_1TargetBytesAvoidExcessAvoidDrainingNamesDrainingNamesMigrationTraceIDDoesntImplyStructuredRecordingAlterSystemTableStatisticsAddAvgSizeColMVCCAddSSTableInsertPublicSchemaNamespaceEntryOnRestoreUnsplitRangesInAsyncGCJobsValidateGrantOptionPebbleFormatBlockPropertyCollectorProbeRequestSelectRPCsTakeTracingInfoInbandPreSeedTenantSpanConfigsSeedTenantSpanConfigsPublicSchemasWithDescriptorsEnsureSpanConfigReconciliationEnsureSpanConfigSubscriptionEnableSpanConfigStoreScanWholeRowsSCRAMAuthenticationUnsafeLossOfQuorumRecoveryRangeLogAlterSystemProtectedTimestampAddColumnEnableProtectedTimestampsForTenantDeleteCommentsWithDroppedIndexesRemoveIncompatibleDatabasePrivilegesAddRaftAppliedIndexTermMigrationPostAddRaftAppliedIndexTermMigrationDontProposeWriteTimestampForLeaseTransfersTenantSettingsTableEnablePebbleFormatVersionBlockPropertiesDisableSystemConfigGossipTriggerMVCCIndexBackfillerEnableLeaseHolderRemovalBackupResolutionInJobLooselyCoupledRaftLogTruncationChangefeedIdlenessBackupDoesNotOverwriteLatestAndCheckpointEnableDeclarativeSchemaChangerRowLevelTTLPebbleFormatSplitUserKeysMarkedIncrementalBackupSubdirDateStyleIntervalStyleCastRewriteEnableNewStoreRebalancerClusterLocksVirtualTableAutoStatsTableSettingsForecastStatsSuperRegionsEnableNewChangefeedOptionsSpanCountTablePreSeedSpanCountTableSeedSpanCountTableV22_1Start22_2LocalTimestampsEnsurePebbleFormatVersionRangeKeysEnablePebbleFormatVersionRangeKeysTrigramInvertedIndexesRemoveGrantPrivilegeMVCCRangeTombstonesUpgradeSequenceToBeReferencedByIDSampledStmtDiagReqsAddSSTableTombstones" -var _Key_index = [...]uint16{0, 5, 14, 36, 54, 76, 113, 152, 166, 207, 233, 252, 286, 298, 329, 353, 374, 402, 432, 460, 481, 494, 513, 547, 585, 619, 651, 687, 719, 755, 797, 816, 856, 888, 907, 931, 952, 983, 1001, 1042, 1072, 1083, 1114, 1137, 1170, 1194, 1218, 1240, 1253, 1265, 1291, 1305, 1326, 1344, 1349, 1358, 1373, 1407, 1441, 1463, 1483, 1502, 1535, 1554} +var _Key_index = [...]uint16{0, 5, 14, 36, 54, 76, 113, 152, 166, 207, 233, 252, 286, 298, 329, 353, 374, 402, 432, 460, 481, 494, 513, 547, 585, 619, 651, 687, 719, 755, 797, 816, 856, 888, 907, 931, 952, 983, 1001, 1042, 1072, 1083, 1114, 1137, 1170, 1194, 1218, 1240, 1253, 1265, 1291, 1305, 1326, 1344, 1349, 1358, 1373, 1407, 1441, 1463, 1483, 1502, 1535, 1554, 1574} func (i Key) String() string { if i < 0 || i >= Key(len(_Key_index)-1) { diff --git a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go index d8ad1988a18e..da319d2e975a 100644 --- a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go +++ b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go @@ -428,32 +428,21 @@ func assertSSTContents(sst []byte, sstTimestamp hlc.Timestamp, stats *enginepb.M defer iter.Close() // Check SST KV pairs. - iter.SeekGE(storage.MVCCKey{Key: keys.MinKey}) - for { - ok, err := iter.Valid() - if err != nil { + for iter.SeekGE(storage.MVCCKey{Key: keys.MinKey}); ; iter.Next() { + if ok, err := iter.Valid(); err != nil { return err - } - if !ok { + } else if !ok { break } - key, valueRaw := iter.UnsafeKey(), iter.UnsafeValue() - value, err := storage.DecodeMVCCValue(valueRaw) - if err != nil { - return err - } + key := iter.UnsafeKey() if key.Timestamp.IsEmpty() { return errors.AssertionFailedf("SST contains inline value or intent for key %s", key) } - if value.IsTombstone() { - return errors.AssertionFailedf("SST contains tombstone for key %s", key) - } if sstTimestamp.IsSet() && key.Timestamp != sstTimestamp { return errors.AssertionFailedf("SST has unexpected timestamp %s (expected %s) for key %s", key.Timestamp, sstTimestamp, key.Key) } - iter.Next() } // Compare statistics with those passed by client. We calculate them at the diff --git a/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go b/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go index 3de05a8b8ef9..bc1f9d96d251 100644 --- a/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go @@ -67,15 +67,17 @@ func TestEvalAddSSTable(t *testing.T) { }{ // Blind writes. "blind writes below existing": { - data: []sstutil.KV{{"a", 5, "a5"}, {"b", 7, ""}}, - sst: []sstutil.KV{{"a", 3, "sst"}, {"b", 2, "sst"}}, - expect: []sstutil.KV{{"a", 5, "a5"}, {"a", 3, "sst"}, {"b", 7, ""}, {"b", 2, "sst"}}, + data: []sstutil.KV{{"a", 5, "a5"}, {"b", 7, ""}, {"c", 6, "c6"}}, + sst: []sstutil.KV{{"a", 3, "sst"}, {"b", 2, "sst"}, {"c", 3, ""}}, + expect: []sstutil.KV{ + {"a", 5, "a5"}, {"a", 3, "sst"}, {"b", 7, ""}, {"b", 2, "sst"}, {"c", 6, "c6"}, {"c", 3, ""}, + }, expectStatsEst: true, }, "blind replaces existing": { - data: []sstutil.KV{{"a", 2, "a2"}}, - sst: []sstutil.KV{{"a", 2, "sst"}}, - expect: []sstutil.KV{{"a", 2, "sst"}}, + data: []sstutil.KV{{"a", 2, "a2"}, {"b", 2, "b2"}}, + sst: []sstutil.KV{{"a", 2, "sst"}, {"b", 2, ""}}, + expect: []sstutil.KV{{"a", 2, "sst"}, {"b", 2, ""}}, expectStatsEst: true, }, "blind errors on AddSSTableRequireAtRequestTimestamp": { @@ -100,11 +102,10 @@ func TestEvalAddSSTable(t *testing.T) { expect: []sstutil.KV{{"b", intentTS, "b0"}, {"c", 1, "sst"}, {"d", 1, "sst"}}, expectStatsEst: true, }, - "blind writes tombstones unless race": { // unfortunately, for performance + "blind writes tombstones": { sst: []sstutil.KV{{"a", 1, ""}}, expect: []sstutil.KV{{"a", 1, ""}}, expectStatsEst: true, - expectErrRace: `SST contains tombstone for key "a"/0.000000001,0`, }, "blind writes SST inline values unless race": { // unfortunately, for performance sst: []sstutil.KV{{"a", 0, "inline"}}, @@ -135,12 +136,11 @@ func TestEvalAddSSTable(t *testing.T) { expect: []sstutil.KV{{"a", 10, "a1"}, {"b", 10, "b1"}}, expectStatsEst: true, }, - "SSTTimestampToRequestTimestamp writes tombstones unless race": { // unfortunately, for performance + "SSTTimestampToRequestTimestamp writes tombstones": { reqTS: 10, toReqTS: 1, sst: []sstutil.KV{{"a", 1, ""}}, expect: []sstutil.KV{{"a", 10, ""}}, - expectErrRace: `SST contains tombstone for key "a"/0.000000001,0`, expectStatsEst: true, }, "SSTTimestampToRequestTimestamp rejects incorrect SST timestamp": { @@ -221,9 +221,9 @@ func TestEvalAddSSTable(t *testing.T) { reqTS: 5, toReqTS: 1, noShadow: true, - data: []sstutil.KV{{"a", 5, "a5"}, {"b", 5, "b5"}}, - sst: []sstutil.KV{{"a", 1, "a5"}, {"b", 1, "b5"}}, - expect: []sstutil.KV{{"a", 5, "a5"}, {"b", 5, "b5"}}, + data: []sstutil.KV{{"a", 5, "a5"}, {"b", 5, "b5"}, {"c", 5, ""}}, + sst: []sstutil.KV{{"a", 1, "a5"}, {"b", 1, "b5"}, {"c", 1, ""}}, + expect: []sstutil.KV{{"a", 5, "a5"}, {"b", 5, "b5"}, {"c", 5, ""}}, }, "SSTTimestampToRequestTimestamp errors with DisallowShadowingBelow equal value above existing below limit": { reqTS: 7, @@ -295,18 +295,16 @@ func TestEvalAddSSTable(t *testing.T) { sst: []sstutil.KV{{"a", 3, "a3"}}, expectErr: &roachpb.WriteTooOldError{}, }, - "DisallowConflicts allows new SST tombstones": { // unfortunately, for performance - noConflict: true, - sst: []sstutil.KV{{"a", 3, ""}}, - expect: []sstutil.KV{{"a", 3, ""}}, - expectErrRace: `SST contains tombstone for key "a"/0.000000003,0`, + "DisallowConflicts allows new SST tombstones": { + noConflict: true, + sst: []sstutil.KV{{"a", 3, ""}}, + expect: []sstutil.KV{{"a", 3, ""}}, }, - "DisallowConflicts rejects SST tombstones when shadowing": { - noConflict: true, - data: []sstutil.KV{{"a", 2, "a2"}}, - sst: []sstutil.KV{{"a", 3, ""}}, - expectErr: "SST values cannot be tombstones", - expectErrRace: `SST contains tombstone for key "a"/0.000000003,0`, + "DisallowConflicts allows SST tombstones when shadowing": { + noConflict: true, + data: []sstutil.KV{{"a", 2, "a2"}}, + sst: []sstutil.KV{{"a", 3, ""}}, + expect: []sstutil.KV{{"a", 3, ""}, {"a", 2, "a2"}}, }, "DisallowConflicts allows new SST inline values": { // unfortunately, for performance noConflict: true, @@ -383,18 +381,16 @@ func TestEvalAddSSTable(t *testing.T) { sst: []sstutil.KV{{"a", 3, "a3"}}, expect: []sstutil.KV{{"a", 3, "a3"}}, }, - "DisallowShadowing allows new SST tombstones": { // unfortunately, for performance - noShadow: true, - sst: []sstutil.KV{{"a", 3, ""}}, - expect: []sstutil.KV{{"a", 3, ""}}, - expectErrRace: `SST contains tombstone for key "a"/0.000000003,0`, + "DisallowShadowing allows new SST tombstones": { + noShadow: true, + sst: []sstutil.KV{{"a", 3, ""}}, + expect: []sstutil.KV{{"a", 3, ""}}, }, "DisallowShadowing rejects SST tombstones when shadowing": { - noShadow: true, - data: []sstutil.KV{{"a", 2, "a2"}}, - sst: []sstutil.KV{{"a", 3, ""}}, - expectErr: "SST values cannot be tombstones", - expectErrRace: `SST contains tombstone for key "a"/0.000000003,0`, + noShadow: true, + data: []sstutil.KV{{"a", 2, "a2"}}, + sst: []sstutil.KV{{"a", 3, ""}}, + expectErr: `ingested key collides with an existing one: "a"`, }, "DisallowShadowing allows new SST inline values": { // unfortunately, for performance noShadow: true, @@ -502,18 +498,22 @@ func TestEvalAddSSTable(t *testing.T) { sst: []sstutil.KV{{"a", 3, "a3"}}, expectErr: `ingested key collides with an existing one: "a"`, }, - "DisallowShadowingBelow allows new SST tombstones": { // unfortunately, for performance + "DisallowShadowingBelow is not generally idempotent with tombstones": { + noShadowBelow: 5, + data: []sstutil.KV{{"a", 3, ""}}, + sst: []sstutil.KV{{"a", 3, ""}}, + expectErr: &roachpb.WriteTooOldError{}, + }, + "DisallowShadowingBelow allows new SST tombstones": { noShadowBelow: 5, sst: []sstutil.KV{{"a", 3, ""}}, expect: []sstutil.KV{{"a", 3, ""}}, - expectErrRace: `SST contains tombstone for key "a"/0.000000003,0`, }, - "DisallowShadowingBelow rejects SST tombstones when shadowing": { + "DisallowShadowingBelow rejects SST tombstones when shadowing below": { noShadowBelow: 5, data: []sstutil.KV{{"a", 2, "a2"}}, sst: []sstutil.KV{{"a", 3, ""}}, - expectErr: "SST values cannot be tombstones", - expectErrRace: `SST contains tombstone for key "a"/0.000000003,0`, + expectErr: `ingested key collides with an existing one: "a"`, }, "DisallowShadowingBelow allows new SST inline values": { // unfortunately, for performance noShadowBelow: 5, @@ -558,6 +558,12 @@ func TestEvalAddSSTable(t *testing.T) { sst: []sstutil.KV{{"a", 3, "sst"}, {"b", 1, "sst"}}, expectErr: `ingested key collides with an existing one: "b"`, }, + "DisallowShadowingBelow tombstone above tombstone": { + noShadowBelow: 5, + data: []sstutil.KV{{"a", 2, ""}, {"a", 1, "a1"}}, + sst: []sstutil.KV{{"a", 3, ""}}, + expect: []sstutil.KV{{"a", 3, ""}, {"a", 2, ""}, {"a", 1, "a1"}}, + }, "DisallowShadowingBelow at limit writes": { noShadowBelow: 5, sst: []sstutil.KV{{"a", 5, "sst"}}, @@ -598,6 +604,12 @@ func TestEvalAddSSTable(t *testing.T) { sst: []sstutil.KV{{"a", 7, "sst"}}, expectErr: `ingested key collides with an existing one: "a"`, }, + "DisallowShadowingBelow tombstone above limit errors on existing below limit": { + noShadowBelow: 5, + data: []sstutil.KV{{"a", 4, "a4"}}, + sst: []sstutil.KV{{"a", 7, ""}}, + expectErr: `ingested key collides with an existing one: "a"`, + }, "DisallowShadowingBelow above limit errors on existing below limit with same value": { noShadowBelow: 5, data: []sstutil.KV{{"a", 4, "a4"}}, @@ -640,6 +652,12 @@ func TestEvalAddSSTable(t *testing.T) { sst: []sstutil.KV{{"a", 7, "a7"}}, expect: []sstutil.KV{{"a", 7, "a7"}}, }, + "DisallowShadowingBelow above limit is idempotent with tombstone": { + noShadowBelow: 5, + data: []sstutil.KV{{"a", 7, ""}}, + sst: []sstutil.KV{{"a", 7, ""}}, + expect: []sstutil.KV{{"a", 7, ""}}, + }, "DisallowShadowingBelow above limit errors below existing": { noShadowBelow: 5, data: []sstutil.KV{{"a", 8, "a8"}}, @@ -764,9 +782,8 @@ func TestEvalAddSSTable(t *testing.T) { UpperBound: keys.MaxKey, }) defer iter.Close() - iter.SeekGE(storage.MVCCKey{Key: keys.SystemPrefix}) scan := []sstutil.KV{} - for { + for iter.SeekGE(storage.MVCCKey{Key: keys.SystemPrefix}); ; iter.Next() { ok, err := iter.Valid() require.NoError(t, err) if !ok { @@ -787,23 +804,21 @@ func TestEvalAddSSTable(t *testing.T) { require.NoError(t, protoutil.Unmarshal(iter.UnsafeValue(), &meta)) if meta.RawBytes == nil { // Skip intent metadata records (value emitted separately). - iter.Next() continue } value, err = roachpb.Value{RawBytes: meta.RawBytes}.GetBytes() require.NoError(t, err) } scan = append(scan, sstutil.KV{key, ts, string(value)}) - iter.Next() } require.Equal(t, tc.expect, scan) // Check that stats were updated correctly. if tc.expectStatsEst { - require.True(t, stats.ContainsEstimates > 0, "expected stats to be estimated") + require.NotZero(t, stats.ContainsEstimates, "expected stats to be estimated") } else { - require.False(t, stats.ContainsEstimates > 0, "found estimated stats") - require.Equal(t, stats, engineStats(t, engine, stats.LastUpdateNanos)) + require.Zero(t, stats.ContainsEstimates, "found estimated stats") + require.Equal(t, engineStats(t, engine, stats.LastUpdateNanos), stats) } }) } @@ -1121,6 +1136,7 @@ func TestAddSSTableMVCCStats(t *testing.T) { {"d", 1, "d"}, {"d", 2, ""}, {"e", 1, "e"}, + {"u", 3, "u"}, {"z", 2, "zzzzzz"}, } { require.NoError(t, engine.PutMVCC(kv.MVCCKey(), kv.MVCCValue())) @@ -1133,14 +1149,16 @@ func TestAddSSTableMVCCStats(t *testing.T) { {"d", 4, "dddd"}, // mvcc-shadow existing deleted d. {"e", 4, "eeee"}, // mvcc-shadow existing 1b. {"j", 2, "jj"}, // no colission – via MVCC or LSM – with existing. + {"t", 3, ""}, // tombstone, no collission + {"u", 5, ""}, // tombstone, shadows existing }) statsDelta := enginepb.MVCCStats{ - // the sst will think it added 4 keys here, but a, c, and e shadow or are shadowed. - LiveCount: -3, - LiveBytes: -109, - // the sst will think it added 5 keys, but only j is new so 4 are over-counted. - KeyCount: -4, - KeyBytes: -20, + // the sst will think it added 5 keys here, but a, c, e, and t shadow or are shadowed. + LiveCount: -4, + LiveBytes: -129, + // the sst will think it added 5 keys, but only j and t are new so 5 are over-counted. + KeyCount: -5, + KeyBytes: -22, // the sst will think it added 6 values, but since one was a perfect (key+ts) // collision, it *replaced* the existing value and is over-counted. ValCount: -1, @@ -1229,6 +1247,7 @@ func TestAddSSTableMVCCStatsDisallowShadowing(t *testing.T) { {"b", 6, ""}, {"g", 5, "gg"}, {"r", 1, "rr"}, + {"t", 3, ""}, {"y", 1, "yy"}, {"y", 2, ""}, {"y", 5, "yyy"}, @@ -1298,12 +1317,14 @@ func TestAddSSTableMVCCStatsDisallowShadowing(t *testing.T) { // Check that there has been no double counting of stats. All keys in second SST are shadowing. require.Equal(t, firstSSTStats, *cArgs.Stats) - // Evaluate the third SST. Two of the three KVs are perfectly shadowing, but - // there is one valid KV which should contribute to the stats. + // Evaluate the third SST. Some of the KVs are perfectly shadowing, but there + // are two valid KVs which should contribute to the stats. sst, start, end = sstutil.MakeSST(t, st, []sstutil.KV{ {"c", 2, "bb"}, // key has the same timestamp and value as the one present in the existing data. {"e", 2, "ee"}, {"h", 6, "hh"}, // key has the same timestamp and value as the one present in the existing data. + {"t", 3, ""}, // identical to existing tombstone. + {"x", 7, ""}, // new tombstone. }) cArgs.Args = &roachpb.AddSSTableRequest{ @@ -1315,15 +1336,15 @@ func TestAddSSTableMVCCStatsDisallowShadowing(t *testing.T) { _, err = batcheval.EvalAddSSTable(ctx, engine, cArgs, &roachpb.AddSSTableResponse{}) require.NoError(t, err) - // This is the stats contribution of the KV {"e", 2, "ee"}. This should be - // the only addition to the cumulative stats, as the other two KVs are - // perfect shadows of existing data. + // This is the stats contribution of the KVs {"e", 2, "ee"} and {"x", 7, ""}. + // This should be the only addition to the cumulative stats, as the other + // KVs are perfect shadows of existing data. delta := enginepb.MVCCStats{ LiveCount: 1, LiveBytes: 21, - KeyCount: 1, - KeyBytes: 14, - ValCount: 1, + KeyCount: 2, + KeyBytes: 28, + ValCount: 2, ValBytes: 7, } @@ -1489,8 +1510,6 @@ func engineStats(t *testing.T, engine storage.Engine, nowNanos int64) *enginepb. UpperBound: keys.MaxKey, }) defer iter.Close() - // We don't care about nowNanos, because the SST can't contain intents or - // tombstones and all existing intents will be resolved. stats, err := storage.ComputeStatsForRange(iter, keys.LocalMax, keys.MaxKey, nowNanos) require.NoError(t, err) return &stats diff --git a/pkg/roachpb/api.proto b/pkg/roachpb/api.proto index b423a972a295..b7f558761ada 100644 --- a/pkg/roachpb/api.proto +++ b/pkg/roachpb/api.proto @@ -1688,8 +1688,13 @@ message AdminVerifyProtectedTimestampResponse { // AddSSTableRequest contains arguments to the AddSSTable method, which links an // SST file into the Pebble log-structured merge-tree. The SST must only contain // committed versioned values with non-zero MVCC timestamps (no intents or -// inline values) and no tombstones. It cannot be used in a transaction, cannot -// be split across ranges, and must be alone in a batch. +// inline values). It cannot be used in a transaction, cannot be split across +// ranges, and must be alone in a batch. +// +// In 22.1, AddSSTable does not support MVCC point tombstones in SSTs, and may +// return an error when given these. 22.2 does allow this, but check the +// AddSSTableTombstones version gate first to make sure there are no 22.1 nodes +// left in the cluster. // // By default, AddSSTable will blindly write the SST contents into Pebble, with // fixed MVCC timestamps unaffected by pushes. This can violate many CRDB diff --git a/pkg/storage/sst.go b/pkg/storage/sst.go index 36fffbbf6abc..64204b93462f 100644 --- a/pkg/storage/sst.go +++ b/pkg/storage/sst.go @@ -38,8 +38,8 @@ import ( // is non-empty, disallowShadowing is ignored. // // The given SST and reader cannot contain intents or inline values (i.e. zero -// timestamps), nor tombstones (i.e. empty values), but this is only checked for -// keys that exist in both sides, for performance. +// timestamps), but this is only checked for keys that exist in both sides, for +// performance. // // The returned MVCC statistics is a delta between the SST-only statistics and // their effect when applied, which when added to the SST statistics will adjust @@ -98,9 +98,6 @@ func CheckSSTConflicts( if err != nil { return err } - if sstValue.IsTombstone() { - return errors.New("SST values cannot be tombstones") - } if !extKey.IsValue() { var mvccMeta enginepb.MVCCMetadata if err = extIter.ValueProto(&mvccMeta); err != nil { @@ -149,15 +146,19 @@ func CheckSSTConflicts( totalBytes := metaKeySize + metaValSize // Update the skipped stats to account for the skipped meta key. - statsDiff.LiveBytes -= totalBytes - statsDiff.LiveCount-- + if !sstValue.IsTombstone() { + statsDiff.LiveBytes -= totalBytes + statsDiff.LiveCount-- + } statsDiff.KeyBytes -= metaKeySize statsDiff.ValBytes -= metaValSize statsDiff.KeyCount-- // Update the stats to account for the skipped versioned key/value. totalBytes = int64(len(sstValueRaw)) + MVCCVersionTimestampSize - statsDiff.LiveBytes -= totalBytes + if !sstValue.IsTombstone() { + statsDiff.LiveBytes -= totalBytes + } statsDiff.KeyBytes -= MVCCVersionTimestampSize statsDiff.ValBytes -= int64(len(sstValueRaw)) statsDiff.ValCount--