diff --git a/build/teamcity-support.sh b/build/teamcity-support.sh index 2fb0212ad1bf..ab3a5273b028 100644 --- a/build/teamcity-support.sh +++ b/build/teamcity-support.sh @@ -102,6 +102,12 @@ function run_json_test() { rm -f "${fullfile}" fi rm -f "${tmpfile}" artifacts/stripped.txt + + # Some unit tests test automatic ballast creation. These ballasts can be + # larger than the maximum artifact size. Remove any artifacts with the + # EMERGENCY_BALLAST filename. + find artifacts -name "EMERGENCY_BALLAST" -delete + tc_end_block "artifacts" # Make it easier to figure out whether we're exiting because of a test failure diff --git a/pkg/base/config.go b/pkg/base/config.go index 6929b69909d9..861e1ed955c3 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -510,6 +510,9 @@ type StorageConfig struct { // MaxSize is used for calculating free space and making rebalancing // decisions. Zero indicates that there is no maximum size. MaxSize int64 + // BallastSize is the amount reserved by a ballast file for manual + // out-of-disk recovery. + BallastSize int64 // Settings instance for cluster-wide knobs. Settings *cluster.Settings // UseFileRegistry is true if the file registry is needed (eg: encryption-at-rest). diff --git a/pkg/base/store_spec.go b/pkg/base/store_spec.go index 52e1d0b8ddc2..6ce9e9c89724 100644 --- a/pkg/base/store_spec.go +++ b/pkg/base/store_spec.go @@ -29,6 +29,7 @@ import ( "github.com/cockroachdb/errors" "github.com/cockroachdb/errors/oserror" "github.com/cockroachdb/pebble" + "github.com/cockroachdb/redact" humanize "github.com/dustin/go-humanize" "github.com/spf13/pflag" ) @@ -79,7 +80,7 @@ type floatInterval struct { // NewSizeSpec parses the string passed into a --size flag and returns a // SizeSpec if it is correctly parsed. func NewSizeSpec( - value string, bytesRange *intInterval, percentRange *floatInterval, + field redact.SafeString, value string, bytesRange *intInterval, percentRange *floatInterval, ) (SizeSpec, error) { var size SizeSpec if fractionRegex.MatchString(value) { @@ -93,13 +94,14 @@ func NewSizeSpec( size.Percent, err = strconv.ParseFloat(factorValue, 64) size.Percent *= percentFactor if err != nil { - return SizeSpec{}, fmt.Errorf("could not parse store size (%s) %s", value, err) + return SizeSpec{}, errors.Newf("could not parse %s size (%s) %s", field, value, err) } if percentRange != nil { if (percentRange.min != nil && size.Percent < *percentRange.min) || (percentRange.max != nil && size.Percent > *percentRange.max) { - return SizeSpec{}, fmt.Errorf( - "store size (%s) must be between %f%% and %f%%", + return SizeSpec{}, errors.Newf( + "%s size (%s) must be between %f%% and %f%%", + field, value, *percentRange.min, *percentRange.max, @@ -110,16 +112,16 @@ func NewSizeSpec( var err error size.InBytes, err = humanizeutil.ParseBytes(value) if err != nil { - return SizeSpec{}, fmt.Errorf("could not parse store size (%s) %s", value, err) + return SizeSpec{}, errors.Newf("could not parse %s size (%s) %s", field, value, err) } if bytesRange != nil { if bytesRange.min != nil && size.InBytes < *bytesRange.min { - return SizeSpec{}, fmt.Errorf("store size (%s) must be larger than %s", value, - humanizeutil.IBytes(*bytesRange.min)) + return SizeSpec{}, errors.Newf("%s size (%s) must be larger than %s", + field, value, humanizeutil.IBytes(*bytesRange.min)) } if bytesRange.max != nil && size.InBytes > *bytesRange.max { - return SizeSpec{}, fmt.Errorf("store size (%s) must be smaller than %s", value, - humanizeutil.IBytes(*bytesRange.max)) + return SizeSpec{}, errors.Newf("%s size (%s) must be smaller than %s", + field, value, humanizeutil.IBytes(*bytesRange.max)) } } } @@ -150,7 +152,7 @@ var _ pflag.Value = &SizeSpec{} // Set adds a new value to the StoreSpecValue. It is the important part of // pflag's value interface. func (ss *SizeSpec) Set(value string) error { - spec, err := NewSizeSpec(value, nil, nil) + spec, err := NewSizeSpec("specified", value, nil, nil) if err != nil { return err } @@ -162,10 +164,11 @@ func (ss *SizeSpec) Set(value string) error { // StoreSpec contains the details that can be specified in the cli pertaining // to the --store flag. type StoreSpec struct { - Path string - Size SizeSpec - InMemory bool - Attributes roachpb.Attributes + Path string + Size SizeSpec + BallastSize *SizeSpec + InMemory bool + Attributes roachpb.Attributes // StickyInMemoryEngineID is a unique identifier associated with a given // store which will remain in memory even after the default Engine close // until it has been explicitly cleaned up by CleanupStickyInMemEngine[s] @@ -190,6 +193,7 @@ type StoreSpec struct { // String returns a fully parsable version of the store spec. func (ss StoreSpec) String() string { + // TODO(jackson): Implement redact.SafeFormatter var buffer bytes.Buffer if len(ss.Path) != 0 { fmt.Fprintf(&buffer, "path=%s,", ss.Path) @@ -203,6 +207,14 @@ func (ss StoreSpec) String() string { if ss.Size.Percent > 0 { fmt.Fprintf(&buffer, "size=%s%%,", humanize.Ftoa(ss.Size.Percent)) } + if ss.BallastSize != nil { + if ss.BallastSize.InBytes > 0 { + fmt.Fprintf(&buffer, "ballast-size=%s,", humanizeutil.IBytes(ss.BallastSize.InBytes)) + } + if ss.BallastSize.Percent > 0 { + fmt.Fprintf(&buffer, "ballast-size=%s%%,", humanize.Ftoa(ss.BallastSize.Percent)) + } + } if len(ss.Attributes.Attrs) > 0 { fmt.Fprint(&buffer, "attrs=") for i, attr := range ss.Attributes.Attrs { @@ -308,6 +320,7 @@ func NewStoreSpec(value string) (StoreSpec, error) { var minPercent float64 = 1 var maxPercent float64 = 100 ss.Size, err = NewSizeSpec( + "store", value, &intInterval{min: &minBytesAllowed}, &floatInterval{min: &minPercent, max: &maxPercent}, @@ -315,6 +328,20 @@ func NewStoreSpec(value string) (StoreSpec, error) { if err != nil { return StoreSpec{}, err } + case "ballast-size": + var minBytesAllowed int64 + var minPercent float64 = 0 + var maxPercent float64 = 50 + ballastSize, err := NewSizeSpec( + "ballast", + value, + &intInterval{min: &minBytesAllowed}, + &floatInterval{min: &minPercent, max: &maxPercent}, + ) + if err != nil { + return StoreSpec{}, err + } + ss.BallastSize = &ballastSize case "attrs": // Check to make sure there are no duplicate attributes. attrMap := make(map[string]struct{}) @@ -384,6 +411,9 @@ func NewStoreSpec(value string) (StoreSpec, error) { if ss.Size.Percent == 0 && ss.Size.InBytes == 0 { return StoreSpec{}, fmt.Errorf("size must be specified for an in memory store") } + if ss.BallastSize != nil { + return StoreSpec{}, fmt.Errorf("ballast-size specified for in memory store") + } } else if ss.Path == "" { return StoreSpec{}, fmt.Errorf("no path specified") } @@ -417,6 +447,14 @@ func (ssl StoreSpecList) String() string { // root directory. It must not be changed without a proper migration. const AuxiliaryDir = "auxiliary" +// EmergencyBallastFile returns the path (relative to a data directory) used +// for an emergency ballast file. The returned path must be stable across +// releases (eg, we cannot change these constants), otherwise we may duplicate +// ballasts. +func EmergencyBallastFile(pathJoin func(...string) string, dataDir string) string { + return pathJoin(dataDir, AuxiliaryDir, "EMERGENCY_BALLAST") +} + // PreventedStartupFile is the filename (relative to 'dir') used for files that // can block server startup. func PreventedStartupFile(dir string) string { diff --git a/pkg/base/store_spec_test.go b/pkg/base/store_spec_test.go index 8ad799f06575..046a5d2cb7b1 100644 --- a/pkg/base/store_spec_test.go +++ b/pkg/base/store_spec_test.go @@ -128,6 +128,13 @@ target_file_size=2097152` {"size=20GiB,path=/mnt/hda1,size=20GiB", "size field was used twice in store definition", StoreSpec{}}, {"size=123TB", "no path specified", StoreSpec{}}, + // ballast size + {"path=/mnt/hda1,ballast-size=671088640", "", StoreSpec{Path: "/mnt/hda1", BallastSize: &SizeSpec{InBytes: 671088640}}}, + {"path=/mnt/hda1,ballast-size=20GB", "", StoreSpec{Path: "/mnt/hda1", BallastSize: &SizeSpec{InBytes: 20000000000}}}, + {"path=/mnt/hda1,ballast-size=1%", "", StoreSpec{Path: "/mnt/hda1", BallastSize: &SizeSpec{Percent: 1}}}, + {"path=/mnt/hda1,ballast-size=100.000%", "ballast size (100.000%) must be between 0.000000% and 50.000000%", StoreSpec{}}, + {"ballast-size=20GiB,path=/mnt/hda1,ballast-size=20GiB", "ballast-size field was used twice in store definition", StoreSpec{}}, + // type {"type=mem,size=20GiB", "", StoreSpec{Size: SizeSpec{InBytes: 21474836480}, InMemory: true}}, {"size=20GiB,type=mem", "", StoreSpec{Size: SizeSpec{InBytes: 21474836480}, InMemory: true}}, diff --git a/pkg/cli/exit/codes.go b/pkg/cli/exit/codes.go index ac41c99f8626..bc96e85e46b6 100644 --- a/pkg/cli/exit/codes.go +++ b/pkg/cli/exit/codes.go @@ -58,6 +58,10 @@ func TimeoutAfterFatalError() Code { return Code{8} } // during a logging operation to a network collector. func LoggingNetCollectorUnavailable() Code { return Code{9} } +// DiskFull (10) indicates an emergency shutdown in response to a +// store's full disk. +func DiskFull() Code { return Code{10} } + // Codes that are specific to client commands follow. It's possible // for codes to be reused across separate client or server commands. // Command-specific exit codes should be allocated down from 125. diff --git a/pkg/cli/start.go b/pkg/cli/start.go index aafc45c232e6..68b3e877ad11 100644 --- a/pkg/cli/start.go +++ b/pkg/cli/start.go @@ -55,6 +55,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing" "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/vfs" "github.com/cockroachdb/redact" "github.com/spf13/cobra" "google.golang.org/grpc" @@ -347,6 +349,16 @@ func runStart(cmd *cobra.Command, args []string, startSingleNode bool) (returnEr }() } + // Check for stores with full disks and exit with an informative exit + // code. This needs to happen early during start, before we perform any + // writes to the filesystem including log rotation. We need to guarantee + // that the process continues to exit with the Disk Full exit code. A + // flapping exit code can affect alerting, including the alerting + // performed within CockroachCloud. + if err := exitIfDiskFull(serverCfg.Stores.Specs); err != nil { + return err + } + // Set up a cancellable context for the entire start command. // The context will be canceled at the end. ctx, cancel := context.WithCancel(context.Background()) @@ -1015,6 +1027,46 @@ func maybeWarnMemorySizes(ctx context.Context) { } } +func exitIfDiskFull(specs []base.StoreSpec) error { + var cause error + var ballastPaths []string + var ballastMissing bool + for _, spec := range specs { + isDiskFull, err := storage.IsDiskFull(vfs.Default, spec) + if err != nil { + return err + } + if !isDiskFull { + continue + } + path := base.EmergencyBallastFile(vfs.Default.PathJoin, spec.Path) + ballastPaths = append(ballastPaths, path) + if _, err := vfs.Default.Stat(path); oserror.IsNotExist(err) { + ballastMissing = true + } + cause = errors.CombineErrors(cause, errors.Newf(`store %s: out of disk space`, spec.Path)) + } + if cause == nil { + return nil + } + + // TODO(jackson): Link to documentation surrounding the ballast. + + err := clierror.NewError(cause, exit.DiskFull()) + if ballastMissing { + return errors.WithHint(err, `At least one ballast file is missing. +You may need to replace this node because there is +insufficient disk space to start.`) + } + + ballastPathsStr := strings.Join(ballastPaths, "\n") + err = errors.WithHintf(err, `Deleting or truncating the ballast file(s) at +%s +may reclaim enough space to start. Proceed with caution. Complete +disk space exhaustion may result in node loss.`, ballastPathsStr) + return err +} + // setupAndInitializeLoggingAndProfiling does what it says on the label. // Prior to this however it determines suitable defaults for the // logging output directory and the verbosity level of stderr logging. diff --git a/pkg/cli/start_test.go b/pkg/cli/start_test.go index 11f451afc613..542839c48205 100644 --- a/pkg/cli/start_test.go +++ b/pkg/cli/start_test.go @@ -104,6 +104,13 @@ func TestStartArgChecking(t *testing.T) { {[]string{`--store=size=-1231MB`}, `store size \(-1231MB\) must be larger than`}, {[]string{`--store=size=1231B`}, `store size \(1231B\) must be larger than`}, {[]string{`--store=size=1231BLA`}, `unhandled size name: bla`}, + {[]string{`--store=ballast-size=60.0`}, `ballast size \(60.0\) must be between 0.000000% and 50.000000%`}, + {[]string{`--store=ballast-size=1231BLA`}, `unhandled size name: bla`}, + {[]string{`--store=ballast-size=0.5%,path=.`}, ``}, + {[]string{`--store=ballast-size=.5,path=.`}, ``}, + {[]string{`--store=ballast-size=50.%,path=.`}, ``}, + {[]string{`--store=ballast-size=50%,path=.`}, ``}, + {[]string{`--store=ballast-size=2GiB,path=.`}, ``}, {[]string{`--store=attrs=bli:bli`}, `duplicate attribute`}, {[]string{`--store=type=bli`}, `bli is not a valid store type`}, {[]string{`--store=bla=bli`}, `bla is not a valid store field`}, diff --git a/pkg/server/config.go b/pkg/server/config.go index 7922c286fc07..96dda3238f2a 100644 --- a/pkg/server/config.go +++ b/pkg/server/config.go @@ -529,11 +529,14 @@ func (cfg *Config) CreateEngines(ctx context.Context) (Engines, error) { engines = append(engines, e) } } else { + if err := vfs.Default.MkdirAll(spec.Path, 0755); err != nil { + return Engines{}, errors.Wrap(err, "creating store directory") + } + du, err := vfs.Default.GetDiskUsage(spec.Path) + if err != nil { + return Engines{}, errors.Wrap(err, "retrieving disk usage") + } if spec.Size.Percent > 0 { - du, err := vfs.Default.GetDiskUsage(spec.Path) - if err != nil { - return Engines{}, err - } sizeInBytes = int64(float64(du.TotalBytes) * spec.Size.Percent / 100) } if sizeInBytes != 0 && !skipSizeCheck && sizeInBytes < base.MinimumStoreSize { @@ -548,6 +551,7 @@ func (cfg *Config) CreateEngines(ctx context.Context) (Engines, error) { Attrs: spec.Attributes, Dir: spec.Path, MaxSize: sizeInBytes, + BallastSize: storage.BallastSizeBytes(spec, du), Settings: cfg.Settings, UseFileRegistry: spec.UseFileRegistry, DisableSeparatedIntents: disableSeparatedIntents, diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel index 9c74d7b4c5ac..35c4d8cacff1 100644 --- a/pkg/storage/BUILD.bazel +++ b/pkg/storage/BUILD.bazel @@ -5,6 +5,7 @@ go_library( srcs = [ "array_32bit.go", "array_64bit.go", + "ballast.go", "batch.go", "disk_map.go", "doc.go", @@ -41,6 +42,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/base", + "//pkg/cli/exit", "//pkg/clusterversion", "//pkg/keys", "//pkg/kv/kvserver/concurrency/lock", @@ -63,6 +65,7 @@ go_library( "//pkg/util/protoutil", "//pkg/util/stop", "//pkg/util/syncutil", + "//pkg/util/sysutil", "//pkg/util/timeutil", "//pkg/util/tracing", "//pkg/util/uuid", @@ -84,6 +87,7 @@ go_test( name = "storage_test", size = "medium", srcs = [ + "ballast_test.go", "batch_test.go", "bench_pebble_test.go", "bench_test.go", @@ -137,6 +141,7 @@ go_test( "//pkg/util/randutil", "//pkg/util/shuffle", "//pkg/util/stop", + "//pkg/util/sysutil", "//pkg/util/timeutil", "//pkg/util/uint128", "//pkg/util/uuid", diff --git a/pkg/storage/ballast.go b/pkg/storage/ballast.go new file mode 100644 index 000000000000..c4bda616e0ec --- /dev/null +++ b/pkg/storage/ballast.go @@ -0,0 +1,153 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage + +import ( + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/util/envutil" + "github.com/cockroachdb/cockroach/pkg/util/sysutil" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/vfs" +) + +// ballastsEnabled allows overriding the automatic creation of the ballast +// files through an environment variable. Developers working on CockroachDB +// may want to include `COCKROACH_AUTO_BALLAST=false` in their environment to +// prevent the automatic creation of large ballast files on their local +// filesystem. +var ballastsEnabled bool = envutil.EnvOrDefaultBool("COCKROACH_AUTO_BALLAST", true) + +// IsDiskFull examines the store indicated by spec, determining whether the +// store's underlying disk is out of disk space. A disk is considered to be +// full if available capacity is less than half of the store's ballast size. +// +// If the current on-disk ballast does not match the configured ballast size +// in spec, IsDiskFull will resize the file if available capacity allows. +func IsDiskFull(fs vfs.FS, spec base.StoreSpec) (bool, error) { + if spec.InMemory { + return false, nil + } + + // The store directory might not exist yet. We don't want to try to create + // it yet, because there might not be any disk space to do so. Check the + // disk usage on the first parent that exists. + path := spec.Path + diskUsage, err := fs.GetDiskUsage(path) + for oserror.IsNotExist(err) { + if parentPath := fs.PathDir(path); parentPath == path { + break + } else { + path = parentPath + } + diskUsage, err = fs.GetDiskUsage(path) + } + if err != nil { + return false, errors.Wrapf(err, "retrieving disk usage: %s", spec.Path) + } + + // Try to resize the ballast now, if necessary. This is necessary to + // truncate the ballast if a new, lower ballast size was provided, + // and the disk space freed by truncation will allow us to start. If + // we need to create or grow the ballast but are unable because + // there's insufficient disk space, it'll be resized by the periodic + // capacity calculations when the conditions are met. + desiredSizeBytes := BallastSizeBytes(spec, diskUsage) + ballastPath := base.EmergencyBallastFile(fs.PathJoin, spec.Path) + if resized, err := maybeEstablishBallast(fs, ballastPath, desiredSizeBytes, diskUsage); err != nil { + return false, err + } else if resized { + diskUsage, err = fs.GetDiskUsage(path) + if err != nil { + return false, errors.Wrapf(err, "retrieving disk usage: %s", spec.Path) + } + } + + // If the filesystem reports less than half the disk space available, + // consider the disk full. If the ballast hasn't been removed yet, + // removing it will free enough disk space to start. We don't use exactly + // the ballast size in case some of the headroom gets consumed elsewhere: + // eg, the operator's shell history, system logs, copy-on-write filesystem + // metadata, etc. + return diskUsage.AvailBytes < uint64(desiredSizeBytes/2), nil +} + +// BallastSizeBytes returns the desired size of the emergency ballast, +// calculated from the provided store spec and disk usage. If the store spec +// contains an explicit ballast size (either in bytes or as a percentage of +// the disk's total capacity), the store spec's size is used. Otherwise, +// BallastSizeBytes returns 1GiB or 1% of total capacity, whichever is +// smaller. +func BallastSizeBytes(spec base.StoreSpec, diskUsage vfs.DiskUsage) int64 { + if spec.BallastSize != nil { + v := spec.BallastSize.InBytes + if spec.BallastSize.Percent != 0 { + v = int64(float64(diskUsage.TotalBytes) * spec.BallastSize.Percent / 100) + } + return v + } + + // Default to a 1% or 1GiB ballast, whichever is smaller. + var v int64 = 1 << 30 // 1 GiB + if p := int64(float64(diskUsage.TotalBytes) * 0.01); v > p { + v = p + } + return v +} + +func maybeEstablishBallast( + fs vfs.FS, ballastPath string, ballastSizeBytes int64, diskUsage vfs.DiskUsage, +) (resized bool, err error) { + var currentSizeBytes int64 + fi, err := fs.Stat(ballastPath) + if err != nil && !oserror.IsNotExist(err) { + return false, err + } else if err == nil { + currentSizeBytes = fi.Size() + } + + switch { + case currentSizeBytes > ballastSizeBytes: + // If the current ballast is too big, shrink it regardless of current + // disk space availability. + // TODO(jackson): Expose Truncate on vfs.FS. + return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes) + case currentSizeBytes < ballastSizeBytes && ballastsEnabled: + if err := fs.MkdirAll(fs.PathDir(ballastPath), 0755); err != nil { + return false, errors.Wrap(err, "creating data directory") + } + // We need to either create the ballast or extend the current ballast + // to make it larger. The ballast may have been intentionally removed + // to enable recovery. Only create/extend the ballast if there's + // sufficient disk space. + extendBytes := ballastSizeBytes - currentSizeBytes + + // If available disk space is >= 4x the required amount, create the + // ballast. + if extendBytes <= int64(diskUsage.AvailBytes)/4 { + return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes) + } + + // If the user configured a really large ballast, we might not ever + // have >= 4x the required amount available. Larger ballast sizes (eg, + // 5%, 10%) are not unreasonably large, but it's possible that after + // recovery available capacity won't exceed 4x the ballast sizes (eg, + // 20%, 40%). Allow extending the ballast if we will have 10 GiB + // available after the extension to account for these large ballasts. + if int64(diskUsage.AvailBytes)-extendBytes > (10 << 30 /* 10 GiB */) { + return true, sysutil.ResizeLargeFile(ballastPath, ballastSizeBytes) + } + + return false, nil + default: + return false, nil + } +} diff --git a/pkg/storage/ballast_test.go b/pkg/storage/ballast_test.go new file mode 100644 index 000000000000..42fdaa3c7faa --- /dev/null +++ b/pkg/storage/ballast_test.go @@ -0,0 +1,240 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. +package storage + +import ( + "fmt" + "path/filepath" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/sysutil" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +func temporarilyEnableBallasts() func() { + prev := ballastsEnabled + ballastsEnabled = true + return func() { ballastsEnabled = prev } +} + +func TestBallastSizeBytes(t *testing.T) { + defer leaktest.AfterTest(t)() + + testCases := []struct { + base.StoreSpec + totalBytes uint64 + want int64 + }{ + { + StoreSpec: base.StoreSpec{}, + totalBytes: 500 << 30, // 500 GiB + want: 1 << 30, // 1 GiB + }, + { + StoreSpec: base.StoreSpec{}, + totalBytes: 25 << 30, // 25 GiB + want: 256 << 20, // 256 MiB + }, + { + StoreSpec: base.StoreSpec{BallastSize: &base.SizeSpec{InBytes: 1 << 30 /* 1 GiB */}}, + totalBytes: 25 << 30, // 25 GiB + want: 1 << 30, // 1 GiB + }, + { + StoreSpec: base.StoreSpec{BallastSize: &base.SizeSpec{Percent: 20}}, + totalBytes: 25 << 30, // 25 GiB + want: 5 << 30, // 5 GiB + }, + { + StoreSpec: base.StoreSpec{BallastSize: &base.SizeSpec{Percent: 20}}, + totalBytes: 500 << 30, // 500 GiB + want: 100 << 30, // 100 GiB + }, + } + + for _, tc := range testCases { + du := vfs.DiskUsage{TotalBytes: tc.totalBytes} + got := BallastSizeBytes(tc.StoreSpec, du) + require.Equal(t, tc.want, got) + } + +} + +func TestIsDiskFull(t *testing.T) { + defer leaktest.AfterTest(t)() + defer temporarilyEnableBallasts()() + + // TODO(jackson): This test could be adapted to use a MemFS if we add + // Truncate to vfs.FS. + + setup := func(t *testing.T, spec *base.StoreSpec, ballastSize int64, du ...vfs.DiskUsage) (vfs.FS, func()) { + dir, dirCleanupFn := testutils.TempDir(t) + fs := mockDiskUsageFS{ + FS: vfs.Default, + diskUsages: du, + } + spec.Path = dir + + if ballastSize > 0 { + path := base.EmergencyBallastFile(fs.PathJoin, spec.Path) + require.NoError(t, fs.MkdirAll(fs.PathDir(path), 0755)) + err := sysutil.ResizeLargeFile(path, ballastSize) + fmt.Printf("Created ballast at %s\n", path) + require.NoError(t, err) + } + return &fs, dirCleanupFn + } + + t.Run("default ballast, full disk", func(t *testing.T) { + spec := base.StoreSpec{ + // NB: A missing ballast size defaults to Min(1GiB, 1% of + // total) = 1GiB, which is greater than available bytes. + BallastSize: nil, + } + fs, cleanup := setup(t, &spec, 0 /* ballastSize */, vfs.DiskUsage{ + AvailBytes: (1 << 28), // 256 MiB + TotalBytes: 500 << 30, // 500 GiB + }) + defer cleanup() + got, err := IsDiskFull(fs, spec) + require.NoError(t, err) + require.True(t, got) + }) + t.Run("default ballast, plenty of space", func(t *testing.T) { + spec := base.StoreSpec{ + // NB: A missing ballast size defaults to Min(1GiB, 1% of + // total) = 1GiB which is greater than available bytes. + BallastSize: nil, + } + fs, cleanup := setup(t, &spec, 0 /* ballastSize */, vfs.DiskUsage{ + AvailBytes: 25 << 30, // 25 GiB + TotalBytes: 500 << 30, // 500 GiB + }) + defer cleanup() + got, err := IsDiskFull(fs, spec) + require.NoError(t, err) + require.False(t, got) + }) + t.Run("truncating ballast frees enough space", func(t *testing.T) { + spec := base.StoreSpec{ + BallastSize: &base.SizeSpec{InBytes: 1024}, + } + // Provide two disk usages. The second one will be returned + // post-truncation. + fs, cleanup := setup(t, &spec, 2048, /* ballastSize */ + vfs.DiskUsage{AvailBytes: 256, TotalBytes: 500 << 30 /* 500 GiB */}, + vfs.DiskUsage{AvailBytes: 1280, TotalBytes: 500 << 30 /* 500 GiB */}) + defer cleanup() + + got, err := IsDiskFull(fs, spec) + require.NoError(t, err) + require.False(t, got) + // The ballast should've been truncated. + fi, err := fs.Stat(base.EmergencyBallastFile(fs.PathJoin, spec.Path)) + require.NoError(t, err) + require.Equal(t, int64(1024), fi.Size()) + }) + t.Run("configured ballast, plenty of space", func(t *testing.T) { + spec := base.StoreSpec{ + BallastSize: &base.SizeSpec{InBytes: 5 << 30 /* 5 GiB */}, + } + fs, cleanup := setup(t, &spec, 0 /* ballastSize */, vfs.DiskUsage{ + AvailBytes: 25 << 30, // 25 GiB + TotalBytes: 500 << 30, // 500 GiB + }) + defer cleanup() + got, err := IsDiskFull(fs, spec) + require.NoError(t, err) + require.False(t, got) + }) +} + +type mockDiskUsageFS struct { + vfs.FS + diskUsagesIdx int + diskUsages []vfs.DiskUsage +} + +func (fs *mockDiskUsageFS) GetDiskUsage(string) (vfs.DiskUsage, error) { + ret := fs.diskUsages[fs.diskUsagesIdx] + if fs.diskUsagesIdx+1 < len(fs.diskUsages) { + fs.diskUsagesIdx++ + } + return ret, nil +} + +func TestMaybeEstablishBallast(t *testing.T) { + defer leaktest.AfterTest(t)() + defer temporarilyEnableBallasts()() + + setup := func(t *testing.T, ballastSize int64) (string, func()) { + dir, dirCleanupFn := testutils.TempDir(t) + path := filepath.Join(dir, "ballast") + if ballastSize > 0 { + err := sysutil.ResizeLargeFile(path, ballastSize) + require.NoError(t, err) + } + return path, dirCleanupFn + } + getSize := func(t *testing.T, path string) int { + fi, err := vfs.Default.Stat(path) + if oserror.IsNotExist(err) { + return 0 + } + require.NoError(t, err) + return int(fi.Size()) + } + + t.Run("insufficient disk space, no ballast", func(t *testing.T) { + path, cleanup := setup(t, 0) + defer cleanup() + resized, err := maybeEstablishBallast(vfs.Default, path, 1<<30 /* 1 GiB */, vfs.DiskUsage{ + AvailBytes: 3 << 30, /* 3 GiB */ + }) + require.NoError(t, err) + require.False(t, resized) + require.Equal(t, 0, getSize(t, path)) + }) + t.Run("sufficient disk space, no ballast", func(t *testing.T) { + path, cleanup := setup(t, 0) + defer cleanup() + resized, err := maybeEstablishBallast(vfs.Default, path, 1024, vfs.DiskUsage{ + AvailBytes: 500 << 20, /* 500 MiB */ + }) + require.NoError(t, err) + require.True(t, resized) + require.Equal(t, 1024, getSize(t, path)) + }) + t.Run("truncates ballast if necessary", func(t *testing.T) { + path, cleanup := setup(t, 2048) + defer cleanup() + resized, err := maybeEstablishBallast(vfs.Default, path, 1024, vfs.DiskUsage{ + AvailBytes: 500 << 20, /* 500 MiB */ + }) + require.NoError(t, err) + require.True(t, resized) + require.Equal(t, 1024, getSize(t, path)) + }) + t.Run("does nothing if ballast is correct size", func(t *testing.T) { + path, cleanup := setup(t, 4096) + defer cleanup() + resized, err := maybeEstablishBallast(vfs.Default, path, 4096, vfs.DiskUsage{ + AvailBytes: 500 << 20, /* 500 MiB */ + }) + require.NoError(t, err) + require.False(t, resized) + require.Equal(t, 4096, getSize(t, path)) + }) +} diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 2f86fc4132a1..9793f868b9a8 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -27,6 +27,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/cli/exit" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" @@ -389,6 +390,10 @@ func DefaultPebbleOptions() *pebble.Options { if diskHealthCheckInterval.Seconds() > maxSyncDurationDefault.Seconds() { diskHealthCheckInterval = maxSyncDurationDefault } + // If we encounter ENOSPC, exit with an informative exit code. + opts.FS = vfs.OnDiskFull(opts.FS, func() { + exit.WithCode(exit.DiskFull()) + }) // Instantiate a file system with disk health checking enabled. This FS wraps // vfs.Default, and can be wrapped for encryption-at-rest. opts.FS = vfs.WithDiskHealthChecks(vfs.Default, diskHealthCheckInterval, @@ -443,11 +448,14 @@ type EncryptionStatsHandler interface { type Pebble struct { db *pebble.DB - closed bool - path string - auxDir string - maxSize int64 - attrs roachpb.Attributes + closed bool + readOnly bool + path string + auxDir string + ballastPath string + ballastSize int64 + maxSize int64 + attrs roachpb.Attributes // settings must be non-nil if this Pebble instance will be used to write // intents. settings *cluster.Settings @@ -558,6 +566,7 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (*Pebble, error) { if err := cfg.Opts.FS.MkdirAll(auxDir, 0755); err != nil { return nil, err } + ballastPath := base.EmergencyBallastFile(cfg.Opts.FS.PathJoin, cfg.Dir) fileRegistry, statsHandler, err := ResolveEncryptedEnvOptions(&cfg) if err != nil { @@ -577,9 +586,34 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (*Pebble, error) { ctx: logCtx, depth: 1, } + + // Establish the emergency ballast if we can. If there's not sufficient + // disk space, the ballast will be reestablished from Capacity when the + // store's capacity is queried periodically. + if !cfg.Opts.ReadOnly { + du, err := cfg.Opts.FS.GetDiskUsage(cfg.Dir) + // If the FS is an in-memory FS, GetDiskUsage returns + // vfs.ErrUnsupported and we skip ballast creation. + if err != nil && !errors.Is(err, vfs.ErrUnsupported) { + return nil, errors.Wrap(err, "retrieving disk usage") + } else if err == nil { + resized, err := maybeEstablishBallast(cfg.Opts.FS, ballastPath, cfg.BallastSize, du) + if err != nil { + return nil, errors.Wrap(err, "resizing ballast") + } + if resized { + cfg.Opts.Logger.Infof("resized ballast %s to size %s", + ballastPath, humanizeutil.IBytes(cfg.BallastSize)) + } + } + } + p := &Pebble{ + readOnly: cfg.Opts.ReadOnly, path: cfg.Dir, auxDir: auxDir, + ballastPath: ballastPath, + ballastSize: cfg.BallastSize, maxSize: cfg.MaxSize, attrs: cfg.Attrs, settings: cfg.Settings, @@ -951,7 +985,15 @@ func (p *Pebble) Attrs() roachpb.Attributes { // Capacity implements the Engine interface. func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) { dir := p.path - if dir == "" { + if dir != "" { + var err error + // Eval directory if it is a symbolic links. + if dir, err = filepath.EvalSymlinks(dir); err != nil { + return roachpb.StoreCapacity{}, err + } + } + du, err := p.fs.GetDiskUsage(dir) + if errors.Is(err, vfs.ErrUnsupported) { // This is an in-memory instance. Pretend we're empty since we // don't know better and only use this for testing. Using any // part of the actual file system here can throw off allocator @@ -960,14 +1002,7 @@ func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) { Capacity: p.maxSize, Available: p.maxSize, }, nil - } - var err error - // Eval directory if it is a symbolic links. - if dir, err = filepath.EvalSymlinks(dir); err != nil { - return roachpb.StoreCapacity{}, err - } - du, err := p.fs.GetDiskUsage(dir) - if err != nil { + } else if err != nil { return roachpb.StoreCapacity{}, err } @@ -982,6 +1017,25 @@ func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) { fsuTotal := int64(du.TotalBytes) fsuAvail := int64(du.AvailBytes) + // If the emergency ballast isn't appropriately sized, try to resize it. + // This is a no-op if the ballast is already sized or if there's not + // enough available capacity to resize it. Capacity is called periodically + // by the kvserver, and that drives the automatic resizing of the ballast. + if !p.readOnly { + resized, err := maybeEstablishBallast(p.fs, p.ballastPath, p.ballastSize, du) + if err != nil { + return roachpb.StoreCapacity{}, errors.Wrap(err, "resizing ballast") + } + if resized { + p.logger.Infof("resized ballast %s to size %s", + p.ballastPath, humanizeutil.IBytes(p.ballastSize)) + du, err = p.fs.GetDiskUsage(dir) + if err != nil { + return roachpb.StoreCapacity{}, err + } + } + } + // Pebble has detailed accounting of its own disk space usage, and it's // incrementally updated which helps avoid O(# files) work here. m := p.db.Metrics() @@ -1005,6 +1059,12 @@ func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) { } return err } + if path == p.ballastPath { + // Skip the ballast. Counting it as used is likely to confuse + // users, and it's more akin to space that is just unavailable + // like disk space often restricted to a root user. + return nil + } if info.Mode().IsRegular() { totalUsedBytes += info.Size() }