From 00dccf589c0ccb1909a22d4956f398d6f3118f6f Mon Sep 17 00:00:00 2001 From: Aditya Maru Date: Thu, 19 Aug 2021 13:34:09 -0400 Subject: [PATCH 1/4] Revert "backupccl: protect entire keyspan during cluster backup" This reverts commit 1b5fd4f000cfd8e5085b0b67e6db0d9fa5feb69b. The commit above laid a pts record over the entire table keyspace. This did not account for two things (with the potential of there being more): 1. System tables that we do not back up could have a short GC TTL, and so incremental backups that attempt to protect from the `StartTime` of the previous backup would fail. 2. Dropped tables often have a short GC TTL to clear data once they have been dropped. This change would also attempt to protect "dropped but not gc'ed tables" even though we exclude them from the backup, causing pts verification to fail. One suggested approach is to exclude all objects we do not back up by subtracting these spans from {TableDataMin, TableDataMax}. This works for system tables and for dropped-but-not-gc'ed tables, but breaks for dropped-and-gc'ed tables. A pts verification would still find the leaseholder of the empty span and attempt to protect below the gc threshold. In conclusion, we need to think about the semantics a little more before we rush to protect a single keyspan. --- pkg/ccl/backupccl/BUILD.bazel | 1 - pkg/ccl/backupccl/backup_planning.go | 17 +-------- pkg/ccl/backupccl/backup_test.go | 55 ---------------------------- 3 files changed, 2 insertions(+), 71 deletions(-) diff --git a/pkg/ccl/backupccl/BUILD.bazel b/pkg/ccl/backupccl/BUILD.bazel index 9054ca0157bb..84acda5664e2 100644 --- a/pkg/ccl/backupccl/BUILD.bazel +++ b/pkg/ccl/backupccl/BUILD.bazel @@ -168,7 +168,6 @@ go_test( "//pkg/kv/kvclient/kvcoord:with-mocks", "//pkg/kv/kvserver", "//pkg/kv/kvserver/kvserverbase", - "//pkg/kv/kvserver/protectedts/ptstorage", "//pkg/roachpb:with-mocks", "//pkg/scheduledjobs", "//pkg/security", diff --git a/pkg/ccl/backupccl/backup_planning.go b/pkg/ccl/backupccl/backup_planning.go index 643ee6f5324f..c6685b1da611 100644 --- a/pkg/ccl/backupccl/backup_planning.go +++ b/pkg/ccl/backupccl/backup_planning.go @@ -1312,18 +1312,6 @@ func backupPlanHook( CreatedBy: backupStmt.CreatedByInfo, } - spansToProtect := spans - // If this is a full cluster backup from the system tenant then we write a - // single protected timestamp record spanning the entire keyspace. - if backupStmt.Coverage() == tree.AllDescriptors { - if p.ExecCfg().Codec.ForSystemTenant() { - spansToProtect = []roachpb.Span{{ - Key: keys.TableDataMin, - EndKey: keys.TableDataMax, - }} - } - } - if backupStmt.Options.Detached { // When running inside an explicit transaction, we simply create the job // record. We do not wait for the job to finish. // The protect timestamp logic for a DETACHED BACKUP can be run within the // same txn as the BACKUP is being planned in, because we do not wait for // the BACKUP job to complete.
- err = protectTimestampForBackup(ctx, p, p.ExtendedEvalContext().Txn, jobID, spansToProtect, + err = protectTimestampForBackup(ctx, p, p.ExtendedEvalContext().Txn, jobID, spans, startTime, endTime, backupDetails) if err != nil { return err @@ -1375,8 +1363,7 @@ func backupPlanHook( if err := doWriteBackupManifestCheckpoint(ctx, jobID); err != nil { return err } - if err := protectTimestampForBackup(ctx, p, plannerTxn, jobID, spansToProtect, startTime, - endTime, backupDetails); err != nil { + if err := protectTimestampForBackup(ctx, p, plannerTxn, jobID, spans, startTime, endTime, backupDetails); err != nil { return err } diff --git a/pkg/ccl/backupccl/backup_test.go b/pkg/ccl/backupccl/backup_test.go index 9aa619838060..9d1585a6523d 100644 --- a/pkg/ccl/backupccl/backup_test.go +++ b/pkg/ccl/backupccl/backup_test.go @@ -26,7 +26,6 @@ import ( "regexp" "strconv" "strings" - "sync" "sync/atomic" "testing" "time" @@ -50,7 +49,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" - "github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptstorage" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security" "github.com/cockroachdb/cockroach/pkg/settings/cluster" @@ -6181,59 +6179,6 @@ func getTableID(db *kv.DB, dbName, tableName string) descpb.ID { return desc.GetID() } -func TestProtectedTimestampSpanSelectionClusterBackup(t *testing.T) { - defer leaktest.AfterTest(t)() - defer log.Scope(t).Close(t) - - const numAccounts = 1 - serverArgs := base.TestServerArgs{Knobs: base.TestingKnobs{JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals()}} - params := base.TestClusterArgs{ServerArgs: serverArgs} - allowRequest := make(chan struct{}) - params.ServerArgs.Knobs.Store = &kvserver.StoreTestingKnobs{ - TestingRequestFilter: func(ctx context.Context, request roachpb.BatchRequest) *roachpb.Error { - for _, ru := range request.Requests { - switch ru.GetInner().(type) { - case *roachpb.ExportRequest: - <-allowRequest - } - } - return nil - }, - } - - _, tc, sqlDB, _, cleanupFn := backupRestoreTestSetupWithParams(t, singleNode, numAccounts, - InitManualReplication, params) - defer cleanupFn() - - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - sqlDB.Exec(t, `BACKUP TO 'nodelocal://self/foo'`) - }() - - var jobID string - conn := tc.Conns[0] - testutils.SucceedsSoon(t, func() error { - row := conn.QueryRow("SELECT job_id FROM [SHOW JOBS] ORDER BY created DESC LIMIT 1") - return row.Scan(&jobID) - }) - - // Check protected timestamp record to ensure we are protecting the clusters - // key-span. - var spans []byte - sqlDB.QueryRow(t, `SELECT spans FROM system.protected_ts_records LIMIT 1`).Scan(&spans) - var protectedSpans ptstorage.Spans - require.NoError(t, protoutil.Unmarshal(spans, &protectedSpans)) - - expectedSpans := ptstorage.Spans{ - Spans: []roachpb.Span{{Key: keys.TableDataMin, EndKey: keys.TableDataMax}}, - } - require.Equal(t, expectedSpans, protectedSpans) - close(allowRequest) - wg.Wait() -} - // TestSpanSelectionDuringBackup tests the method spansForAllTableIndexes which // is used to resolve the spans which will be backed up, and spans for which // protected ts records will be created. 
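For concreteness, the subtraction approach floated in the commit message above could look roughly like the sketch below. This is an illustrative, standalone Go program, not code from any patch in this series: the Span type stands in for roachpb.Span, subtractSpans is a hypothetical helper, and the cover span stands in for {TableDataMin, TableDataMax}. As the commit message notes, even this approach is insufficient, because pts verification still reaches the leaseholder of a dropped-and-gc'ed table's now-empty span.

package main

import (
	"bytes"
	"fmt"
)

// Span mirrors the shape of roachpb.Span for this standalone sketch.
type Span struct {
	Key, EndKey []byte
}

// subtractSpans returns the pieces of cover not overlapped by any excluded
// span. It assumes excluded is sorted by start key and non-overlapping.
func subtractSpans(cover Span, excluded []Span) []Span {
	var out []Span
	cur := cover.Key
	for _, ex := range excluded {
		// Emit the gap between the current position and this exclusion.
		if bytes.Compare(ex.Key, cur) > 0 {
			out = append(out, Span{Key: cur, EndKey: ex.Key})
		}
		// Skip past the exclusion.
		if bytes.Compare(ex.EndKey, cur) > 0 {
			cur = ex.EndKey
		}
	}
	if bytes.Compare(cur, cover.EndKey) < 0 {
		out = append(out, Span{Key: cur, EndKey: cover.EndKey})
	}
	return out
}

func main() {
	// Stand-ins for the table keyspace and the spans of objects that are
	// excluded from the backup.
	cover := Span{Key: []byte("/Table/1"), EndKey: []byte("/Table/9")}
	excluded := []Span{
		{Key: []byte("/Table/3"), EndKey: []byte("/Table/4")}, // e.g. a system table
		{Key: []byte("/Table/6"), EndKey: []byte("/Table/7")}, // e.g. a dropped table
	}
	for _, sp := range subtractSpans(cover, excluded) {
		fmt.Printf("protect %s - %s\n", sp.Key, sp.EndKey)
	}
	// Output:
	// protect /Table/1 - /Table/3
	// protect /Table/4 - /Table/6
	// protect /Table/7 - /Table/9
}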
From 2b179422ba70badaf949e5e011b894a15c4b043d Mon Sep 17 00:00:00 2001 From: Josh Imhoff Date: Tue, 10 Aug 2021 09:36:07 -0400 Subject: [PATCH 2/4] keys: add a range-local key for probing This commit introduces a range-local key for probing. The key will only be used by probing components like kvprober. This means no contention with user traffic or other CRDB components. This key also provides a safe place to write to in order to test write availability. A kvprober that does writes is coming soon. Release note: None. --- pkg/keys/constants.go | 2 ++ pkg/keys/doc.go | 1 + pkg/keys/keys.go | 7 +++++++ pkg/keys/printer.go | 1 + pkg/keys/printer_test.go | 1 + 5 files changed, 12 insertions(+) diff --git a/pkg/keys/constants.go b/pkg/keys/constants.go index bf61c547d8d1..401457ad1692 100644 --- a/pkg/keys/constants.go +++ b/pkg/keys/constants.go @@ -137,6 +137,8 @@ var ( // key info, such as the txn ID in the case of a transaction record. LocalRangePrefix = roachpb.Key(makeKey(LocalPrefix, roachpb.RKey("k"))) LocalRangeMax = LocalRangePrefix.PrefixEnd() + // LocalRangeProbeSuffix is the suffix for keys for probing. + LocalRangeProbeSuffix = roachpb.RKey("prbe") // LocalQueueLastProcessedSuffix is the suffix for replica queue state keys. LocalQueueLastProcessedSuffix = roachpb.RKey("qlpt") // LocalRangeDescriptorSuffix is the suffix for keys storing diff --git a/pkg/keys/doc.go b/pkg/keys/doc.go index 1b0227008260..a4f00c5dabb8 100644 --- a/pkg/keys/doc.go +++ b/pkg/keys/doc.go @@ -210,6 +210,7 @@ var _ = [...]interface{}{ // as a whole. They are replicated and addressable. Typical examples are // the range descriptor and transaction records. They all share // `LocalRangePrefix`. + RangeProbeKey, // "prbe" QueueLastProcessedKey, // "qlpt" RangeDescriptorKey, // "rdsc" TransactionKey, // "txn-" diff --git a/pkg/keys/keys.go b/pkg/keys/keys.go index d833585db5b1..4f0cb8c15851 100644 --- a/pkg/keys/keys.go +++ b/pkg/keys/keys.go @@ -417,6 +417,13 @@ func QueueLastProcessedKey(key roachpb.RKey, queue string) roachpb.Key { return MakeRangeKey(key, LocalQueueLastProcessedSuffix, roachpb.RKey(queue)) } +// RangeProbeKey returns a range-local key for probing. The +// purpose of the key is to test CRDB in production; any data present at the +// key has no purpose except to allow testing CRDB in production. +func RangeProbeKey(key roachpb.RKey) roachpb.Key { + return MakeRangeKey(key, LocalRangeProbeSuffix, nil) +} + // LockTableSingleKey creates a key under which all single-key locks for the // given key can be found.
buf is used as scratch-space, up to its capacity, // to avoid allocations -- its contents will be overwritten and not appended diff --git a/pkg/keys/printer.go b/pkg/keys/printer.go index 1a345bc7eed4..e60a0831104b 100644 --- a/pkg/keys/printer.go +++ b/pkg/keys/printer.go @@ -183,6 +183,7 @@ var ( {name: "RangeDescriptor", suffix: LocalRangeDescriptorSuffix, atEnd: true}, {name: "Transaction", suffix: LocalTransactionSuffix, atEnd: false}, {name: "QueueLastProcessed", suffix: LocalQueueLastProcessedSuffix, atEnd: false}, + {name: "RangeProbe", suffix: LocalRangeProbeSuffix, atEnd: true}, } ) diff --git a/pkg/keys/printer_test.go b/pkg/keys/printer_test.go index 9291d81cce09..579ec2480005 100644 --- a/pkg/keys/printer_test.go +++ b/pkg/keys/printer_test.go @@ -87,6 +87,7 @@ func TestPrettyPrint(t *testing.T) { {keys.MakeRangeKeyPrefix(roachpb.RKey(tenSysCodec.TablePrefix(42))), `/Local/Range/Table/42`, revertSupportUnknown}, {keys.RangeDescriptorKey(roachpb.RKey(tenSysCodec.TablePrefix(42))), `/Local/Range/Table/42/RangeDescriptor`, revertSupportUnknown}, {keys.TransactionKey(tenSysCodec.TablePrefix(42), txnID), fmt.Sprintf(`/Local/Range/Table/42/Transaction/%q`, txnID), revertSupportUnknown}, + {keys.RangeProbeKey(roachpb.RKey(tenSysCodec.TablePrefix(42))), `/Local/Range/Table/42/RangeProbe`, revertSupportUnknown}, {keys.QueueLastProcessedKey(roachpb.RKey(tenSysCodec.TablePrefix(42)), "foo"), `/Local/Range/Table/42/QueueLastProcessed/"foo"`, revertSupportUnknown}, {lockTableKey(keys.RangeDescriptorKey(roachpb.RKey(tenSysCodec.TablePrefix(42)))), `/Local/Lock/Intent/Local/Range/Table/42/RangeDescriptor`, revertSupportUnknown}, {lockTableKey(tenSysCodec.TablePrefix(111)), "/Local/Lock/Intent/Table/111", revertSupportUnknown}, From f41ac2486f0b3254498c06f5df0670b2175ec036 Mon Sep 17 00:00:00 2001 From: Josh Imhoff Date: Tue, 10 Aug 2021 09:42:51 -0400 Subject: [PATCH 3/4] kvprober: probe the range-local key dedicated to probing Before this commit, kvprober probed the start key of a range. This worked okay, as kvprober only did reads, and contention issues leading to false positive pages haven't happened in practice. But contention issues are possible, as there may be data located at the start key of the range. With this commit, kvprober probes the range-local key dedicated to probing. No contention issues are possible, as that key is only for probing. This key is also needed for write probes, which are coming soon. Release note: None. --- pkg/kv/kvprober/kvprober.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/kv/kvprober/kvprober.go b/pkg/kv/kvprober/kvprober.go index c1a5bbf3a577..5585b56b9521 100644 --- a/pkg/kv/kvprober/kvprober.go +++ b/pkg/kv/kvprober/kvprober.go @@ -22,6 +22,7 @@ import ( "math/rand" "time" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/util/contextutil" @@ -198,11 +199,11 @@ func (p *Prober) probe(ctx context.Context, db dbGet) { // perspective of the user. timeout := readTimeout.Get(&p.settings.SV) err = contextutil.RunWithTimeout(ctx, "db.Get", timeout, func(ctx context.Context) error { - // We read the start key for the range. There may be no data at the key, - // but that is okay. Even if there is no data at the key, the prober still - // executes a basic read operation on the range. + // We read a "range-local" key dedicated to probing. See pkg/keys for more. 
+ // There is no data at the key, but that is okay. Even though there is no + // data at the key, the prober still executes a read operation on the range. // TODO(josh): Trace the probes. - _, err = db.Get(ctx, step.StartKey) + _, err = db.Get(ctx, keys.RangeProbeKey(step.StartKey)) return err }) if err != nil { From 54282ad5728ded88c7b40c1c663c7e57ff6ac2b9 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Thu, 24 Jun 2021 11:11:44 -0400 Subject: [PATCH 4/4] cli,storage: add emergency ballast Add an automatically created, on-by-default emergency ballast file. This new ballast defaults to the minimum of 1% total disk capacity or 1GiB. The size of the ballast may be configured via the `--store` flag with a `ballast-size` field, accepting the same value formats as the `size` field. The ballast is automatically created either when available disk space is at least four times the ballast size, or when available disk space after creating the ballast is at least 10 GiB. Creation of the ballast happens either when the engine is opened or during the periodic Capacity calculations driven by the `kvserver.Store`. During node start, if available disk space is less than or equal to half the ballast size, the node exits immediately with a new Disk Full (10) exit code. See #66493. Release note (ops change): Add an automatically created, on-by-default emergency ballast file. This new ballast defaults to the minimum of 1% total disk capacity or 1GiB. The size of the ballast may be configured via the `--store` flag with a `ballast-size` field, accepting the same value formats as the `size` field. Also, add a new Disk Full (10) exit code that indicates that the node exited because disk space on at least one store is exhausted. On node start, if any store has less than half the ballast size available, the node immediately exits with the Disk Full (10) exit code. The operator may manually remove the configured ballast (assuming they haven't already) to allow the node to start, and they can take action to remedy the disk space exhaustion. The ballast will automatically be recreated when available disk space is at least 4x the ballast size, or when at least 10 GiB is available after the ballast is created. --- build/teamcity-support.sh | 6 + pkg/base/config.go | 3 + pkg/base/store_spec.go | 66 +++++++--- pkg/base/store_spec_test.go | 7 ++ pkg/cli/exit/codes.go | 4 + pkg/cli/start.go | 52 ++++++++ pkg/cli/start_test.go | 7 ++ pkg/server/config.go | 12 +- pkg/storage/BUILD.bazel | 5 + pkg/storage/ballast.go | 153 +++++++++++++++++++++++ pkg/storage/ballast_test.go | 240 ++++++++++++++++++++++++++++++++++++ pkg/storage/pebble.go | 88 ++++++++++--- 12 files changed, 611 insertions(+), 32 deletions(-) create mode 100644 pkg/storage/ballast.go create mode 100644 pkg/storage/ballast_test.go diff --git a/build/teamcity-support.sh b/build/teamcity-support.sh index 2fb0212ad1bf..ab3a5273b028 100644 --- a/build/teamcity-support.sh +++ b/build/teamcity-support.sh @@ -102,6 +102,12 @@ function run_json_test() { rm -f "${fullfile}" fi rm -f "${tmpfile}" artifacts/stripped.txt + + # Some unit tests test automatic ballast creation. These ballasts can be + # larger than the maximum artifact size. Remove any artifacts with the + # EMERGENCY_BALLAST filename.
+ find artifacts -name "EMERGENCY_BALLAST" -delete + tc_end_block "artifacts" # Make it easier to figure out whether we're exiting because of a test failure diff --git a/pkg/base/config.go b/pkg/base/config.go index 6929b69909d9..861e1ed955c3 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -510,6 +510,9 @@ type StorageConfig struct { // MaxSize is used for calculating free space and making rebalancing // decisions. Zero indicates that there is no maximum size. MaxSize int64 + // BallastSize is the amount reserved by a ballast file for manual + // out-of-disk recovery. + BallastSize int64 // Settings instance for cluster-wide knobs. Settings *cluster.Settings // UseFileRegistry is true if the file registry is needed (eg: encryption-at-rest). diff --git a/pkg/base/store_spec.go b/pkg/base/store_spec.go index 52e1d0b8ddc2..6ce9e9c89724 100644 --- a/pkg/base/store_spec.go +++ b/pkg/base/store_spec.go @@ -29,6 +29,7 @@ import ( "github.com/cockroachdb/errors" "github.com/cockroachdb/errors/oserror" "github.com/cockroachdb/pebble" + "github.com/cockroachdb/redact" humanize "github.com/dustin/go-humanize" "github.com/spf13/pflag" ) @@ -79,7 +80,7 @@ type floatInterval struct { // NewSizeSpec parses the string passed into a --size flag and returns a // SizeSpec if it is correctly parsed. func NewSizeSpec( - value string, bytesRange *intInterval, percentRange *floatInterval, + field redact.SafeString, value string, bytesRange *intInterval, percentRange *floatInterval, ) (SizeSpec, error) { var size SizeSpec if fractionRegex.MatchString(value) { @@ -93,13 +94,14 @@ func NewSizeSpec( size.Percent, err = strconv.ParseFloat(factorValue, 64) size.Percent *= percentFactor if err != nil { - return SizeSpec{}, fmt.Errorf("could not parse store size (%s) %s", value, err) + return SizeSpec{}, errors.Newf("could not parse %s size (%s) %s", field, value, err) } if percentRange != nil { if (percentRange.min != nil && size.Percent < *percentRange.min) || (percentRange.max != nil && size.Percent > *percentRange.max) { - return SizeSpec{}, fmt.Errorf( - "store size (%s) must be between %f%% and %f%%", + return SizeSpec{}, errors.Newf( + "%s size (%s) must be between %f%% and %f%%", + field, value, *percentRange.min, *percentRange.max, @@ -110,16 +112,16 @@ func NewSizeSpec( var err error size.InBytes, err = humanizeutil.ParseBytes(value) if err != nil { - return SizeSpec{}, fmt.Errorf("could not parse store size (%s) %s", value, err) + return SizeSpec{}, errors.Newf("could not parse %s size (%s) %s", field, value, err) } if bytesRange != nil { if bytesRange.min != nil && size.InBytes < *bytesRange.min { - return SizeSpec{}, fmt.Errorf("store size (%s) must be larger than %s", value, - humanizeutil.IBytes(*bytesRange.min)) + return SizeSpec{}, errors.Newf("%s size (%s) must be larger than %s", + field, value, humanizeutil.IBytes(*bytesRange.min)) } if bytesRange.max != nil && size.InBytes > *bytesRange.max { - return SizeSpec{}, fmt.Errorf("store size (%s) must be smaller than %s", value, - humanizeutil.IBytes(*bytesRange.max)) + return SizeSpec{}, errors.Newf("%s size (%s) must be smaller than %s", + field, value, humanizeutil.IBytes(*bytesRange.max)) } } } @@ -150,7 +152,7 @@ var _ pflag.Value = &SizeSpec{} // Set adds a new value to the StoreSpecValue. It is the important part of // pflag's value interface. 
func (ss *SizeSpec) Set(value string) error { - spec, err := NewSizeSpec(value, nil, nil) + spec, err := NewSizeSpec("specified", value, nil, nil) if err != nil { return err } @@ -162,10 +164,11 @@ func (ss *SizeSpec) Set(value string) error { // StoreSpec contains the details that can be specified in the cli pertaining // to the --store flag. type StoreSpec struct { - Path string - Size SizeSpec - InMemory bool - Attributes roachpb.Attributes + Path string + Size SizeSpec + BallastSize *SizeSpec + InMemory bool + Attributes roachpb.Attributes // StickyInMemoryEngineID is a unique identifier associated with a given // store which will remain in memory even after the default Engine close // until it has been explicitly cleaned up by CleanupStickyInMemEngine[s] @@ -190,6 +193,7 @@ type StoreSpec struct { // String returns a fully parsable version of the store spec. func (ss StoreSpec) String() string { + // TODO(jackson): Implement redact.SafeFormatter var buffer bytes.Buffer if len(ss.Path) != 0 { fmt.Fprintf(&buffer, "path=%s,", ss.Path) @@ -203,6 +207,14 @@ func (ss StoreSpec) String() string { if ss.Size.Percent > 0 { fmt.Fprintf(&buffer, "size=%s%%,", humanize.Ftoa(ss.Size.Percent)) } + if ss.BallastSize != nil { + if ss.BallastSize.InBytes > 0 { + fmt.Fprintf(&buffer, "ballast-size=%s,", humanizeutil.IBytes(ss.BallastSize.InBytes)) + } + if ss.BallastSize.Percent > 0 { + fmt.Fprintf(&buffer, "ballast-size=%s%%,", humanize.Ftoa(ss.BallastSize.Percent)) + } + } if len(ss.Attributes.Attrs) > 0 { fmt.Fprint(&buffer, "attrs=") for i, attr := range ss.Attributes.Attrs { @@ -308,6 +320,7 @@ func NewStoreSpec(value string) (StoreSpec, error) { var minPercent float64 = 1 var maxPercent float64 = 100 ss.Size, err = NewSizeSpec( + "store", value, &intInterval{min: &minBytesAllowed}, &floatInterval{min: &minPercent, max: &maxPercent}, @@ -315,6 +328,20 @@ func NewStoreSpec(value string) (StoreSpec, error) { if err != nil { return StoreSpec{}, err } + case "ballast-size": + var minBytesAllowed int64 + var minPercent float64 = 0 + var maxPercent float64 = 50 + ballastSize, err := NewSizeSpec( + "ballast", + value, + &intInterval{min: &minBytesAllowed}, + &floatInterval{min: &minPercent, max: &maxPercent}, + ) + if err != nil { + return StoreSpec{}, err + } + ss.BallastSize = &ballastSize case "attrs": // Check to make sure there are no duplicate attributes. attrMap := make(map[string]struct{}) @@ -384,6 +411,9 @@ func NewStoreSpec(value string) (StoreSpec, error) { if ss.Size.Percent == 0 && ss.Size.InBytes == 0 { return StoreSpec{}, fmt.Errorf("size must be specified for an in memory store") } + if ss.BallastSize != nil { + return StoreSpec{}, fmt.Errorf("ballast-size specified for in memory store") + } } else if ss.Path == "" { return StoreSpec{}, fmt.Errorf("no path specified") } @@ -417,6 +447,14 @@ func (ssl StoreSpecList) String() string { // root directory. It must not be changed without a proper migration. const AuxiliaryDir = "auxiliary" +// EmergencyBallastFile returns the path (relative to a data directory) used +// for an emergency ballast file. The returned path must be stable across +// releases (eg, we cannot change these constants), otherwise we may duplicate +// ballasts. +func EmergencyBallastFile(pathJoin func(...string) string, dataDir string) string { + return pathJoin(dataDir, AuxiliaryDir, "EMERGENCY_BALLAST") +} + // PreventedStartupFile is the filename (relative to 'dir') used for files that // can block server startup. 
func PreventedStartupFile(dir string) string { diff --git a/pkg/base/store_spec_test.go b/pkg/base/store_spec_test.go index 8ad799f06575..046a5d2cb7b1 100644 --- a/pkg/base/store_spec_test.go +++ b/pkg/base/store_spec_test.go @@ -128,6 +128,13 @@ target_file_size=2097152` {"size=20GiB,path=/mnt/hda1,size=20GiB", "size field was used twice in store definition", StoreSpec{}}, {"size=123TB", "no path specified", StoreSpec{}}, + // ballast size + {"path=/mnt/hda1,ballast-size=671088640", "", StoreSpec{Path: "/mnt/hda1", BallastSize: &SizeSpec{InBytes: 671088640}}}, + {"path=/mnt/hda1,ballast-size=20GB", "", StoreSpec{Path: "/mnt/hda1", BallastSize: &SizeSpec{InBytes: 20000000000}}}, + {"path=/mnt/hda1,ballast-size=1%", "", StoreSpec{Path: "/mnt/hda1", BallastSize: &SizeSpec{Percent: 1}}}, + {"path=/mnt/hda1,ballast-size=100.000%", "ballast size (100.000%) must be between 0.000000% and 50.000000%", StoreSpec{}}, + {"ballast-size=20GiB,path=/mnt/hda1,ballast-size=20GiB", "ballast-size field was used twice in store definition", StoreSpec{}}, + // type {"type=mem,size=20GiB", "", StoreSpec{Size: SizeSpec{InBytes: 21474836480}, InMemory: true}}, {"size=20GiB,type=mem", "", StoreSpec{Size: SizeSpec{InBytes: 21474836480}, InMemory: true}}, diff --git a/pkg/cli/exit/codes.go b/pkg/cli/exit/codes.go index ac41c99f8626..bc96e85e46b6 100644 --- a/pkg/cli/exit/codes.go +++ b/pkg/cli/exit/codes.go @@ -58,6 +58,10 @@ func TimeoutAfterFatalError() Code { return Code{8} } // during a logging operation to a network collector. func LoggingNetCollectorUnavailable() Code { return Code{9} } +// DiskFull (10) indicates an emergency shutdown in response to a +// store's full disk. +func DiskFull() Code { return Code{10} } + // Codes that are specific to client commands follow. It's possible // for codes to be reused across separate client or server commands. // Command-specific exit codes should be allocated down from 125. diff --git a/pkg/cli/start.go b/pkg/cli/start.go index aafc45c232e6..68b3e877ad11 100644 --- a/pkg/cli/start.go +++ b/pkg/cli/start.go @@ -55,6 +55,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing" "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/vfs" "github.com/cockroachdb/redact" "github.com/spf13/cobra" "google.golang.org/grpc" @@ -347,6 +349,16 @@ func runStart(cmd *cobra.Command, args []string, startSingleNode bool) (returnEr }() } + // Check for stores with full disks and exit with an informative exit + // code. This needs to happen early during start, before we perform any + // writes to the filesystem including log rotation. We need to guarantee + // that the process continues to exit with the Disk Full exit code. A + // flapping exit code can affect alerting, including the alerting + // performed within CockroachCloud. + if err := exitIfDiskFull(serverCfg.Stores.Specs); err != nil { + return err + } + // Set up a cancellable context for the entire start command. // The context will be canceled at the end. 
ctx, cancel := context.WithCancel(context.Background()) @@ -1015,6 +1027,46 @@ func maybeWarnMemorySizes(ctx context.Context) { } } +func exitIfDiskFull(specs []base.StoreSpec) error { + var cause error + var ballastPaths []string + var ballastMissing bool + for _, spec := range specs { + isDiskFull, err := storage.IsDiskFull(vfs.Default, spec) + if err != nil { + return err + } + if !isDiskFull { + continue + } + path := base.EmergencyBallastFile(vfs.Default.PathJoin, spec.Path) + ballastPaths = append(ballastPaths, path) + if _, err := vfs.Default.Stat(path); oserror.IsNotExist(err) { + ballastMissing = true + } + cause = errors.CombineErrors(cause, errors.Newf(`store %s: out of disk space`, spec.Path)) + } + if cause == nil { + return nil + } + + // TODO(jackson): Link to documentation surrounding the ballast. + + err := clierror.NewError(cause, exit.DiskFull()) + if ballastMissing { + return errors.WithHint(err, `At least one ballast file is missing. +You may need to replace this node because there is +insufficient disk space to start.`) + } + + ballastPathsStr := strings.Join(ballastPaths, "\n") + err = errors.WithHintf(err, `Deleting or truncating the ballast file(s) at +%s +may reclaim enough space to start. Proceed with caution. Complete +disk space exhaustion may result in node loss.`, ballastPathsStr) + return err +} + // setupAndInitializeLoggingAndProfiling does what it says on the label. // Prior to this however it determines suitable defaults for the // logging output directory and the verbosity level of stderr logging. diff --git a/pkg/cli/start_test.go b/pkg/cli/start_test.go index 11f451afc613..542839c48205 100644 --- a/pkg/cli/start_test.go +++ b/pkg/cli/start_test.go @@ -104,6 +104,13 @@ func TestStartArgChecking(t *testing.T) { {[]string{`--store=size=-1231MB`}, `store size \(-1231MB\) must be larger than`}, {[]string{`--store=size=1231B`}, `store size \(1231B\) must be larger than`}, {[]string{`--store=size=1231BLA`}, `unhandled size name: bla`}, + {[]string{`--store=ballast-size=60.0`}, `ballast size \(60.0\) must be between 0.000000% and 50.000000%`}, + {[]string{`--store=ballast-size=1231BLA`}, `unhandled size name: bla`}, + {[]string{`--store=ballast-size=0.5%,path=.`}, ``}, + {[]string{`--store=ballast-size=.5,path=.`}, ``}, + {[]string{`--store=ballast-size=50.%,path=.`}, ``}, + {[]string{`--store=ballast-size=50%,path=.`}, ``}, + {[]string{`--store=ballast-size=2GiB,path=.`}, ``}, {[]string{`--store=attrs=bli:bli`}, `duplicate attribute`}, {[]string{`--store=type=bli`}, `bli is not a valid store type`}, {[]string{`--store=bla=bli`}, `bla is not a valid store field`}, diff --git a/pkg/server/config.go b/pkg/server/config.go index 7922c286fc07..96dda3238f2a 100644 --- a/pkg/server/config.go +++ b/pkg/server/config.go @@ -529,11 +529,14 @@ func (cfg *Config) CreateEngines(ctx context.Context) (Engines, error) { engines = append(engines, e) } } else { + if err := vfs.Default.MkdirAll(spec.Path, 0755); err != nil { + return Engines{}, errors.Wrap(err, "creating store directory") + } + du, err := vfs.Default.GetDiskUsage(spec.Path) + if err != nil { + return Engines{}, errors.Wrap(err, "retrieving disk usage") + } if spec.Size.Percent > 0 { - du, err := vfs.Default.GetDiskUsage(spec.Path) - if err != nil { - return Engines{}, err - } sizeInBytes = int64(float64(du.TotalBytes) * spec.Size.Percent / 100) } if sizeInBytes != 0 && !skipSizeCheck && sizeInBytes < base.MinimumStoreSize { @@ -548,6 +551,7 @@ func (cfg *Config) CreateEngines(ctx context.Context) (Engines, 
error) { Attrs: spec.Attributes, Dir: spec.Path, MaxSize: sizeInBytes, + BallastSize: storage.BallastSizeBytes(spec, du), Settings: cfg.Settings, UseFileRegistry: spec.UseFileRegistry, DisableSeparatedIntents: disableSeparatedIntents, diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel index 9c74d7b4c5ac..35c4d8cacff1 100644 --- a/pkg/storage/BUILD.bazel +++ b/pkg/storage/BUILD.bazel @@ -5,6 +5,7 @@ go_library( srcs = [ "array_32bit.go", "array_64bit.go", + "ballast.go", "batch.go", "disk_map.go", "doc.go", @@ -41,6 +42,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/base", + "//pkg/cli/exit", "//pkg/clusterversion", "//pkg/keys", "//pkg/kv/kvserver/concurrency/lock", @@ -63,6 +65,7 @@ go_library( "//pkg/util/protoutil", "//pkg/util/stop", "//pkg/util/syncutil", + "//pkg/util/sysutil", "//pkg/util/timeutil", "//pkg/util/tracing", "//pkg/util/uuid", @@ -84,6 +87,7 @@ go_test( name = "storage_test", size = "medium", srcs = [ + "ballast_test.go", "batch_test.go", "bench_pebble_test.go", "bench_test.go", @@ -137,6 +141,7 @@ go_test( "//pkg/util/randutil", "//pkg/util/shuffle", "//pkg/util/stop", + "//pkg/util/sysutil", "//pkg/util/timeutil", "//pkg/util/uint128", "//pkg/util/uuid", diff --git a/pkg/storage/ballast.go b/pkg/storage/ballast.go new file mode 100644 index 000000000000..c4bda616e0ec --- /dev/null +++ b/pkg/storage/ballast.go @@ -0,0 +1,153 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage + +import ( + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/util/envutil" + "github.com/cockroachdb/cockroach/pkg/util/sysutil" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/vfs" +) + +// ballastsEnabled allows overriding the automatic creation of the ballast +// files through an environment variable. Developers working on CockroachDB +// may want to include `COCKROACH_AUTO_BALLAST=false` in their environment to +// prevent the automatic creation of large ballast files on their local +// filesystem. +var ballastsEnabled bool = envutil.EnvOrDefaultBool("COCKROACH_AUTO_BALLAST", true) + +// IsDiskFull examines the store indicated by spec, determining whether the +// store's underlying disk is out of disk space. A disk is considered to be +// full if available capacity is less than half of the store's ballast size. +// +// If the current on-disk ballast does not match the configured ballast size +// in spec, IsDiskFull will resize the file if available capacity allows. +func IsDiskFull(fs vfs.FS, spec base.StoreSpec) (bool, error) { + if spec.InMemory { + return false, nil + } + + // The store directory might not exist yet. We don't want to try to create + // it yet, because there might not be any disk space to do so. Check the + // disk usage on the first parent that exists. 
+ path := spec.Path + diskUsage, err := fs.GetDiskUsage(path) + for oserror.IsNotExist(err) { + if parentPath := fs.PathDir(path); parentPath == path { + break + } else { + path = parentPath + } + diskUsage, err = fs.GetDiskUsage(path) + } + if err != nil { + return false, errors.Wrapf(err, "retrieving disk usage: %s", spec.Path) + } + + // Try to resize the ballast now, if necessary. Resizing here + // truncates the ballast if a new, lower ballast size was provided, + // and the disk space freed by truncation will allow us to start. If + // we need to create or grow the ballast but are unable because + // there's insufficient disk space, it'll be resized by the periodic + // capacity calculations when the conditions are met. + desiredSizeBytes := BallastSizeBytes(spec, diskUsage) + ballastPath := base.EmergencyBallastFile(fs.PathJoin, spec.Path) + if resized, err := maybeEstablishBallast(fs, ballastPath, desiredSizeBytes, diskUsage); err != nil { + return false, err + } else if resized { + diskUsage, err = fs.GetDiskUsage(path) + if err != nil { + return false, errors.Wrapf(err, "retrieving disk usage: %s", spec.Path) + } + } + + // If the filesystem reports less than half the ballast size available, + // consider the disk full. If the ballast hasn't been removed yet, + // removing it will free enough disk space to start. We don't use exactly + // the ballast size in case some of the headroom gets consumed elsewhere: + // eg, the operator's shell history, system logs, copy-on-write filesystem + // metadata, etc. + return diskUsage.AvailBytes < uint64(desiredSizeBytes/2), nil +} + +// BallastSizeBytes returns the desired size of the emergency ballast, +// calculated from the provided store spec and disk usage. If the store spec +// contains an explicit ballast size (either in bytes or as a percentage of +// the disk's total capacity), the store spec's size is used. Otherwise, +// BallastSizeBytes returns 1GiB or 1% of total capacity, whichever is +// smaller. +func BallastSizeBytes(spec base.StoreSpec, diskUsage vfs.DiskUsage) int64 { + if spec.BallastSize != nil { + v := spec.BallastSize.InBytes + if spec.BallastSize.Percent != 0 { + v = int64(float64(diskUsage.TotalBytes) * spec.BallastSize.Percent / 100) + } + return v + } + + // Default to a 1% or 1GiB ballast, whichever is smaller. + var v int64 = 1 << 30 // 1 GiB + if p := int64(float64(diskUsage.TotalBytes) * 0.01); v > p { + v = p + } + return v +} + +func maybeEstablishBallast( + fs vfs.FS, ballastPath string, ballastSizeBytes int64, diskUsage vfs.DiskUsage, +) (resized bool, err error) { + var currentSizeBytes int64 + fi, err := fs.Stat(ballastPath) + if err != nil && !oserror.IsNotExist(err) { + return false, err + } else if err == nil { + currentSizeBytes = fi.Size() + } + + switch { + case currentSizeBytes > ballastSizeBytes: + // If the current ballast is too big, shrink it regardless of current + // disk space availability. + // TODO(jackson): Expose Truncate on vfs.FS. + return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes) + case currentSizeBytes < ballastSizeBytes && ballastsEnabled: + if err := fs.MkdirAll(fs.PathDir(ballastPath), 0755); err != nil { + return false, errors.Wrap(err, "creating data directory") + } + // We need to either create the ballast or extend the current ballast + // to make it larger. The ballast may have been intentionally removed + // to enable recovery. Only create/extend the ballast if there's + // sufficient disk space.
+ extendBytes := ballastSizeBytes - currentSizeBytes + + // If available disk space is >= 4x the required amount, create the + // ballast. + if extendBytes <= int64(diskUsage.AvailBytes)/4 { + return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes) + } + + // If the user configured a really large ballast, we might not ever + // have >= 4x the required amount available. Larger ballast sizes (eg, + // 5%, 10%) are not unreasonably large, but it's possible that after + // recovery available capacity won't exceed 4x the ballast sizes (eg, + // 20%, 40%). Allow extending the ballast if we will have 10 GiB + // available after the extension to account for these large ballasts. + if int64(diskUsage.AvailBytes)-extendBytes > (10 << 30 /* 10 GiB */) { + return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes) + } + + return false, nil + default: + return false, nil + } +} diff --git a/pkg/storage/ballast_test.go b/pkg/storage/ballast_test.go new file mode 100644 index 000000000000..42fdaa3c7faa --- /dev/null +++ b/pkg/storage/ballast_test.go @@ -0,0 +1,240 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. +package storage + +import ( + "fmt" + "path/filepath" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/sysutil" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +func temporarilyEnableBallasts() func() { + prev := ballastsEnabled + ballastsEnabled = true + return func() { ballastsEnabled = prev } +} + +func TestBallastSizeBytes(t *testing.T) { + defer leaktest.AfterTest(t)() + + testCases := []struct { + base.StoreSpec + totalBytes uint64 + want int64 + }{ + { + StoreSpec: base.StoreSpec{}, + totalBytes: 500 << 30, // 500 GiB + want: 1 << 30, // 1 GiB + }, + { + StoreSpec: base.StoreSpec{}, + totalBytes: 25 << 30, // 25 GiB + want: 256 << 20, // 256 MiB + }, + { + StoreSpec: base.StoreSpec{BallastSize: &base.SizeSpec{InBytes: 1 << 30 /* 1 GiB */}}, + totalBytes: 25 << 30, // 25 GiB + want: 1 << 30, // 1 GiB + }, + { + StoreSpec: base.StoreSpec{BallastSize: &base.SizeSpec{Percent: 20}}, + totalBytes: 25 << 30, // 25 GiB + want: 5 << 30, // 5 GiB + }, + { + StoreSpec: base.StoreSpec{BallastSize: &base.SizeSpec{Percent: 20}}, + totalBytes: 500 << 30, // 500 GiB + want: 100 << 30, // 100 GiB + }, + } + + for _, tc := range testCases { + du := vfs.DiskUsage{TotalBytes: tc.totalBytes} + got := BallastSizeBytes(tc.StoreSpec, du) + require.Equal(t, tc.want, got) + } + +} + +func TestIsDiskFull(t *testing.T) { + defer leaktest.AfterTest(t)() + defer temporarilyEnableBallasts()() + + // TODO(jackson): This test could be adapted to use a MemFS if we add + // Truncate to vfs.FS. 
+ + setup := func(t *testing.T, spec *base.StoreSpec, ballastSize int64, du ...vfs.DiskUsage) (vfs.FS, func()) { + dir, dirCleanupFn := testutils.TempDir(t) + fs := mockDiskUsageFS{ + FS: vfs.Default, + diskUsages: du, + } + spec.Path = dir + + if ballastSize > 0 { + path := base.EmergencyBallastFile(fs.PathJoin, spec.Path) + require.NoError(t, fs.MkdirAll(fs.PathDir(path), 0755)) + err := sysutil.ResizeLargeFile(path, ballastSize) + fmt.Printf("Created ballast at %s\n", path) + require.NoError(t, err) + } + return &fs, dirCleanupFn + } + + t.Run("default ballast, full disk", func(t *testing.T) { + spec := base.StoreSpec{ + // NB: A missing ballast size defaults to Min(1GiB, 1% of + // total) = 1GiB, which is greater than available bytes. + BallastSize: nil, + } + fs, cleanup := setup(t, &spec, 0 /* ballastSize */, vfs.DiskUsage{ + AvailBytes: (1 << 28), // 256 MiB + TotalBytes: 500 << 30, // 500 GiB + }) + defer cleanup() + got, err := IsDiskFull(fs, spec) + require.NoError(t, err) + require.True(t, got) + }) + t.Run("default ballast, plenty of space", func(t *testing.T) { + spec := base.StoreSpec{ + // NB: A missing ballast size defaults to Min(1GiB, 1% of + // total) = 1GiB which is greater than available bytes. + BallastSize: nil, + } + fs, cleanup := setup(t, &spec, 0 /* ballastSize */, vfs.DiskUsage{ + AvailBytes: 25 << 30, // 25 GiB + TotalBytes: 500 << 30, // 500 GiB + }) + defer cleanup() + got, err := IsDiskFull(fs, spec) + require.NoError(t, err) + require.False(t, got) + }) + t.Run("truncating ballast frees enough space", func(t *testing.T) { + spec := base.StoreSpec{ + BallastSize: &base.SizeSpec{InBytes: 1024}, + } + // Provide two disk usages. The second one will be returned + // post-truncation. + fs, cleanup := setup(t, &spec, 2048, /* ballastSize */ + vfs.DiskUsage{AvailBytes: 256, TotalBytes: 500 << 30 /* 500 GiB */}, + vfs.DiskUsage{AvailBytes: 1280, TotalBytes: 500 << 30 /* 500 GiB */}) + defer cleanup() + + got, err := IsDiskFull(fs, spec) + require.NoError(t, err) + require.False(t, got) + // The ballast should've been truncated. 
+ fi, err := fs.Stat(base.EmergencyBallastFile(fs.PathJoin, spec.Path)) + require.NoError(t, err) + require.Equal(t, int64(1024), fi.Size()) + }) + t.Run("configured ballast, plenty of space", func(t *testing.T) { + spec := base.StoreSpec{ + BallastSize: &base.SizeSpec{InBytes: 5 << 30 /* 5 GiB */}, + } + fs, cleanup := setup(t, &spec, 0 /* ballastSize */, vfs.DiskUsage{ + AvailBytes: 25 << 30, // 25 GiB + TotalBytes: 500 << 30, // 500 GiB + }) + defer cleanup() + got, err := IsDiskFull(fs, spec) + require.NoError(t, err) + require.False(t, got) + }) +} + +type mockDiskUsageFS struct { + vfs.FS + diskUsagesIdx int + diskUsages []vfs.DiskUsage +} + +func (fs *mockDiskUsageFS) GetDiskUsage(string) (vfs.DiskUsage, error) { + ret := fs.diskUsages[fs.diskUsagesIdx] + if fs.diskUsagesIdx+1 < len(fs.diskUsages) { + fs.diskUsagesIdx++ + } + return ret, nil +} + +func TestMaybeEstablishBallast(t *testing.T) { + defer leaktest.AfterTest(t)() + defer temporarilyEnableBallasts()() + + setup := func(t *testing.T, ballastSize int64) (string, func()) { + dir, dirCleanupFn := testutils.TempDir(t) + path := filepath.Join(dir, "ballast") + if ballastSize > 0 { + err := sysutil.ResizeLargeFile(path, ballastSize) + require.NoError(t, err) + } + return path, dirCleanupFn + } + getSize := func(t *testing.T, path string) int { + fi, err := vfs.Default.Stat(path) + if oserror.IsNotExist(err) { + return 0 + } + require.NoError(t, err) + return int(fi.Size()) + } + + t.Run("insufficient disk space, no ballast", func(t *testing.T) { + path, cleanup := setup(t, 0) + defer cleanup() + resized, err := maybeEstablishBallast(vfs.Default, path, 1<<30 /* 1 GiB */, vfs.DiskUsage{ + AvailBytes: 3 << 30, /* 3 GiB */ + }) + require.NoError(t, err) + require.False(t, resized) + require.Equal(t, 0, getSize(t, path)) + }) + t.Run("sufficient disk space, no ballast", func(t *testing.T) { + path, cleanup := setup(t, 0) + defer cleanup() + resized, err := maybeEstablishBallast(vfs.Default, path, 1024, vfs.DiskUsage{ + AvailBytes: 500 << 20, /* 500 MiB */ + }) + require.NoError(t, err) + require.True(t, resized) + require.Equal(t, 1024, getSize(t, path)) + }) + t.Run("truncates ballast if necessary", func(t *testing.T) { + path, cleanup := setup(t, 2048) + defer cleanup() + resized, err := maybeEstablishBallast(vfs.Default, path, 1024, vfs.DiskUsage{ + AvailBytes: 500 << 20, /* 500 MiB */ + }) + require.NoError(t, err) + require.True(t, resized) + require.Equal(t, 1024, getSize(t, path)) + }) + t.Run("does nothing if ballast is correct size", func(t *testing.T) { + path, cleanup := setup(t, 4096) + defer cleanup() + resized, err := maybeEstablishBallast(vfs.Default, path, 4096, vfs.DiskUsage{ + AvailBytes: 500 << 20, /* 500 MiB */ + }) + require.NoError(t, err) + require.False(t, resized) + require.Equal(t, 4096, getSize(t, path)) + }) +} diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 2f86fc4132a1..9793f868b9a8 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -27,6 +27,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/cli/exit" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" @@ -389,6 +390,10 @@ func DefaultPebbleOptions() *pebble.Options { if diskHealthCheckInterval.Seconds() > maxSyncDurationDefault.Seconds() { diskHealthCheckInterval = maxSyncDurationDefault } + // If we encounter ENOSPC, exit with an 
informative exit code. + opts.FS = vfs.OnDiskFull(opts.FS, func() { + exit.WithCode(exit.DiskFull()) + }) // Instantiate a file system with disk health checking enabled. This FS wraps // vfs.Default, and can be wrapped for encryption-at-rest. opts.FS = vfs.WithDiskHealthChecks(vfs.Default, diskHealthCheckInterval, @@ -443,11 +448,14 @@ type EncryptionStatsHandler interface { type Pebble struct { db *pebble.DB - closed bool - path string - auxDir string - maxSize int64 - attrs roachpb.Attributes + closed bool + readOnly bool + path string + auxDir string + ballastPath string + ballastSize int64 + maxSize int64 + attrs roachpb.Attributes // settings must be non-nil if this Pebble instance will be used to write // intents. settings *cluster.Settings @@ -558,6 +566,7 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (*Pebble, error) { if err := cfg.Opts.FS.MkdirAll(auxDir, 0755); err != nil { return nil, err } + ballastPath := base.EmergencyBallastFile(cfg.Opts.FS.PathJoin, cfg.Dir) fileRegistry, statsHandler, err := ResolveEncryptedEnvOptions(&cfg) if err != nil { return nil, err } @@ -577,9 +586,34 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (*Pebble, error) { ctx: logCtx, depth: 1, } + + // Establish the emergency ballast if we can. If there's insufficient + // disk space, the ballast will be reestablished from Capacity when the + // store's capacity is queried periodically. + if !cfg.Opts.ReadOnly { + du, err := cfg.Opts.FS.GetDiskUsage(cfg.Dir) + // If the FS is an in-memory FS, GetDiskUsage returns + // vfs.ErrUnsupported and we skip ballast creation. + if err != nil && !errors.Is(err, vfs.ErrUnsupported) { + return nil, errors.Wrap(err, "retrieving disk usage") + } else if err == nil { + resized, err := maybeEstablishBallast(cfg.Opts.FS, ballastPath, cfg.BallastSize, du) + if err != nil { + return nil, errors.Wrap(err, "resizing ballast") + } + if resized { + cfg.Opts.Logger.Infof("resized ballast %s to size %s", + ballastPath, humanizeutil.IBytes(cfg.BallastSize)) + } + } + } + p := &Pebble{ + readOnly: cfg.Opts.ReadOnly, path: cfg.Dir, auxDir: auxDir, + ballastPath: ballastPath, + ballastSize: cfg.BallastSize, maxSize: cfg.MaxSize, attrs: cfg.Attrs, settings: cfg.Settings, @@ -951,7 +985,15 @@ func (p *Pebble) Attrs() roachpb.Attributes { // Capacity implements the Engine interface. func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) { dir := p.path - if dir == "" { + if dir != "" { + var err error + // Eval the directory if it is a symbolic link. + if dir, err = filepath.EvalSymlinks(dir); err != nil { + return roachpb.StoreCapacity{}, err + } + } + du, err := p.fs.GetDiskUsage(dir) + if errors.Is(err, vfs.ErrUnsupported) { // This is an in-memory instance. Pretend we're empty since we // don't know better and only use this for testing. Using any // part of the actual file system here can throw off allocator // rebalancing in a hard-to-trace manner. See #7050. return roachpb.StoreCapacity{ Capacity: p.maxSize, Available: p.maxSize, }, nil - } - var err error - // Eval directory if it is a symbolic links. - if dir, err = filepath.EvalSymlinks(dir); err != nil { - return roachpb.StoreCapacity{}, err - } - du, err := p.fs.GetDiskUsage(dir) - if err != nil { + } else if err != nil { return roachpb.StoreCapacity{}, err } @@ -982,6 +1017,25 @@ func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) { fsuTotal := int64(du.TotalBytes) fsuAvail := int64(du.AvailBytes) + // If the emergency ballast isn't appropriately sized, try to resize it.
+ // This is a no-op if the ballast is already correctly sized or if there's not + // enough available capacity to resize it. Capacity is called periodically + // by the kvserver, and that drives the automatic resizing of the ballast. + if !p.readOnly { + resized, err := maybeEstablishBallast(p.fs, p.ballastPath, p.ballastSize, du) + if err != nil { + return roachpb.StoreCapacity{}, errors.Wrap(err, "resizing ballast") + } + if resized { + p.logger.Infof("resized ballast %s to size %s", + p.ballastPath, humanizeutil.IBytes(p.ballastSize)) + du, err = p.fs.GetDiskUsage(dir) + if err != nil { + return roachpb.StoreCapacity{}, err + } + } + } + // Pebble has detailed accounting of its own disk space usage, and it's // incrementally updated which helps avoid O(# files) work here. m := p.db.Metrics() @@ -1005,6 +1059,12 @@ func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) { } return err } + if path == p.ballastPath { + // Skip the ballast. Counting it as used is likely to confuse + // users, and it's more akin to space that is simply unavailable, + // like disk space reserved for the root user. + return nil + } if info.Mode().IsRegular() { totalUsedBytes += info.Size() }
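As a quick reference for the sizing and creation rules implemented in pkg/storage/ballast.go above, the following standalone sketch restates them with concrete numbers. It is an illustration only, using simplified integer math and hypothetical function names (defaultBallastSize, canEstablish); the authoritative logic is BallastSizeBytes and maybeEstablishBallast in the patch.

package main

import "fmt"

const (
	mib = int64(1) << 20
	gib = int64(1) << 30
)

// defaultBallastSize restates the default from BallastSizeBytes: 1 GiB or
// 1% of total disk capacity, whichever is smaller.
func defaultBallastSize(totalBytes int64) int64 {
	size := gib
	if p := totalBytes / 100; p < size {
		size = p
	}
	return size
}

// canEstablish restates maybeEstablishBallast's creation conditions: create
// or extend the ballast only if available space is at least 4x the bytes to
// be added, or if more than 10 GiB would remain available afterwards.
func canEstablish(availBytes, extendBytes int64) bool {
	return extendBytes <= availBytes/4 || availBytes-extendBytes > 10*gib
}

func main() {
	// On a 500 GiB disk, 1% is 5 GiB, so the 1 GiB cap wins.
	ballast := defaultBallastSize(500 * gib)
	fmt.Println(ballast == gib) // true

	// On a 25 GiB disk, 1% is 256 MiB, matching TestBallastSizeBytes.
	fmt.Println(defaultBallastSize(25*gib) == 256*mib) // true

	// 25 GiB free is at least 4x a 1 GiB ballast, so the ballast is created.
	fmt.Println(canEstablish(25*gib, ballast)) // true

	// 3 GiB free fails both conditions, matching the "insufficient disk
	// space, no ballast" case in ballast_test.go.
	fmt.Println(canEstablish(3*gib, ballast)) // false

	// Node-start disk-full check: available space below half the ballast
	// size means the node exits with the Disk Full (10) exit code.
	fmt.Println(256*mib < ballast/2) // true
}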