From f148fa55c6179e12b1879febf1d5169da5baf69c Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Tue, 18 Oct 2022 19:30:56 +0200 Subject: [PATCH] cli/mt_start_sql: share code between server startups Prior to this patch, the code in `runStartSQL` was using a different sequence of steps than in `runStart`, even for those steps that are beneficial to run in both cases. This commit fixes that. In particular it adds the following missing bits to `cockroach mt start-sql`: - it fixes support for the (test-only) `COCKROACH_EXPERIMENTAL_LINEARIZABLE` env var. (from #4754) - it adds a tracing span for the startup code. (from #8712) - it properly supports `--listening-url-file`. (from #15468) - it properly does sanitization of `--external-io-dir`. (from #19725) - it sets the proper log severity level for gRPC. (from #20308) - it reports the command-line and env config to logs. (from #21344) - it prevents startup if there is a `_CRITICAL_ALERT.txt` file in the store directory. (from #42401) - sets the umask for newly created file to remove "other" permission bits. This was a security team request originally. (from #44043) - it sets `GOMAXPROCS` from current cgroup limits. (from #57390) - it stops the server early if the storage disk is full. (from #66893) - it fixes support for the `COCKROACH_DISABLE_SPAN_CONFIGS` config env var. (from #73876) To review this commit, it is useful to open the files `start.go` and `mt_start_sql.go` side-by-side, which clarifies how much closer they have become to each other. Looking at the `mt_start_sql.go` changes without that context likely makes the review more difficult. A later commit will actually merge the two code paths together so there is just one thing to maintain. Release note: None --- pkg/cli/mt_start_sql.go | 148 +++++++++++++++++++++++++++++++++++++++- pkg/cli/start.go | 2 + pkg/server/config.go | 19 +++++- 3 files changed, 164 insertions(+), 5 deletions(-) diff --git a/pkg/cli/mt_start_sql.go b/pkg/cli/mt_start_sql.go index 8e7642b3e362..e2e1b31968bb 100644 --- a/pkg/cli/mt_start_sql.go +++ b/pkg/cli/mt_start_sql.go @@ -13,16 +13,27 @@ package cli import ( "context" "fmt" + "net/url" "os" "os/signal" + "github.com/cockroachdb/cockroach/pkg/cli/clierror" "github.com/cockroachdb/cockroach/pkg/cli/clierrorplus" + "github.com/cockroachdb/cockroach/pkg/cli/exit" "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/security/clientsecopts" + "github.com/cockroachdb/cockroach/pkg/security/username" "github.com/cockroachdb/cockroach/pkg/server" "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/util/cgroups" + "github.com/cockroachdb/cockroach/pkg/util/grpcutil" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/log/severity" "github.com/cockroachdb/cockroach/pkg/util/sdnotify" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/pebble/vfs" "github.com/spf13/cobra" ) @@ -73,9 +84,39 @@ func runStartSQL(cmd *cobra.Command, args []string) error { return err } + // Change the permission mask for all created files. + // + // We're considering everything produced by a cockroach node + // to potentially contain sensitive information, so it should + // not be world-readable. + disableOtherPermissionBits() + + // Set up the signal handlers. This also ensures that any of these + // signals received beyond this point do not interrupt the startup + // sequence until the point signals are checked below. + // We want to set up signal handling before starting logging, because + // logging uses buffering, and we want to be able to sync + // the buffers in the signal handler below. If we started capturing + // signals later, some startup logging might be lost. signalCh := make(chan os.Signal, 1) signal.Notify(signalCh, drainSignals...) + // Check for stores with full disks and exit with an informative exit + // code. This needs to happen early during start, before we perform any + // writes to the filesystem including log rotation. We need to guarantee + // that the process continues to exit with the Disk Full exit code. A + // flapping exit code can affect alerting, including the alerting + // performed within CockroachCloud. + if err := exitIfDiskFull(vfs.Default, serverCfg.Stores.Specs); err != nil { + return err + } + + // If any store has something to say against a server start-up + // (e.g. previously detected corruption), listen to them now. + if err := serverCfg.Stores.PriorCriticalAlertError(); err != nil { + return clierror.NewError(err, exit.FatalError()) + } + // Set up a cancellable context for the entire start command. // The context will be canceled at the end. ctx, cancel := context.WithCancel(context.Background()) @@ -84,17 +125,62 @@ func runStartSQL(cmd *cobra.Command, args []string) error { // The context annotation ensures that server identifiers show up // in the logging metadata as soon as they are known. ambientCtx := serverCfg.AmbientCtx - ctx = ambientCtx.AnnotateCtx(ctx) + // Annotate the context, and set up a tracing span for the start process. + // + // The context annotation ensures that server identifiers show up + // in the logging metadata as soon as they are known. + // + // The tracing span is because we want any logging happening beyond + // this point to be accounted to this start context, including + // logging related to the initialization of the logging + // infrastructure below. This span concludes when the startup + // goroutine started below has completed. TODO(andrei): we don't + // close the span on the early returns below. + var startupSpan *tracing.Span + ctx, startupSpan = ambientCtx.AnnotateCtxWithSpan(ctx, "server start") + + // Set up the logging and profiling output. + // + // We want to do this as early as possible, because most of the code + // in CockroachDB may use logging, and until logging has been + // initialized log files will be created in $TMPDIR instead of their + // expected location. + // + // This initialization uses the various configuration parameters + // initialized by flag handling (before runStart was called). Any + // additional server configuration tweaks for the startup process + // must be necessarily non-logging-related, as logging parameters + // cannot be picked up beyond this point. stopper, err := setupAndInitializeLoggingAndProfiling(ctx, cmd, false /* isServerCmd */) if err != nil { return err } - defer stopper.Stop(ctx) + defer stopper.Stop(ctx) // TODO(knz): Move this. stopper.SetTracer(serverCfg.BaseConfig.AmbientCtx.Tracer) + // We don't care about GRPCs fairly verbose logs in most client commands, + // but when actually starting a server, we enable them. + grpcutil.LowerSeverity(severity.WARNING) + + // Tweak GOMAXPROCS if we're in a cgroup / container that has cpu limits set. + // The GO default for GOMAXPROCS is NumCPU(), however this is less + // than ideal if the cgroup is limited to a number lower than that. + // + // TODO(bilal): various global settings have already been initialized based on + // GOMAXPROCS(0) by now. + cgroups.AdjustMaxProcs(ctx) + + // Now perform additional configuration tweaks specific to the start + // command. + st := serverCfg.BaseConfig.Settings + // Derive temporary/auxiliary directory specifications. + if st.ExternalIODir, err = initExternalIODir(ctx, serverCfg.Stores.Specs[0]); err != nil { + return err + } + // This value is injected in order to have something populated during startup. // In the initial 20.2 release of multi-tenant clusters, no version state was // ever populated in the version cluster setting. A value is populated during @@ -117,8 +203,31 @@ func runStartSQL(cmd *cobra.Command, args []string) error { return err } + // Configure the default storage engine. + // NB: we only support one engine type for now. + if serverCfg.StorageEngine == enginepb.EngineTypeDefault { + serverCfg.StorageEngine = enginepb.EngineTypePebble + } + + // Initialize the node's configuration from startup parameters. + // This also reads the part of the configuration that comes from + // environment variables. + if err := serverCfg.InitSQLServer(ctx); err != nil { + return err + } + + // The configuration is now ready to report to the user and the log + // file. We had to wait after InitNode() so that all configuration + // environment variables, which are reported too, have been read and + // registered. + reportConfiguration(ctx) + initGEOS(ctx) + // Beyond this point, the configuration is set and the server is + // ready to start. + log.Ops.Info(ctx, "starting cockroach SQL node") + tenantServer, err := server.NewTenantServer( ctx, stopper, @@ -132,6 +241,11 @@ func runStartSQL(cmd *cobra.Command, args []string) error { return err } + // Inform the user if the network settings are suspicious. We need + // to do that after starting to listen because we need to know + // which advertise address NewServer() has decided. + hintServerCmdFlags(ctx, cmd) + // If another process was waiting on the PID (e.g. using a FIFO), // this is when we can tell them the node has started listening. if startCtx.pidFile != "" { @@ -141,11 +255,41 @@ func runStartSQL(cmd *cobra.Command, args []string) error { } } + // If the invoker has requested an URL update, do it now that + // the server is ready to accept SQL connections. + // (Note: as stated above, ReadyFn is called after the server + // has started listening on its socket, but possibly before + // the cluster has been initialized and can start processing requests. + // This is OK for SQL clients, as the connection will be accepted + // by the network listener and will just wait/suspend until + // the cluster initializes, at which point it will be picked up + // and let the client go through, transparently.) + if startCtx.listeningURLFile != "" { + log.Ops.Infof(ctx, "listening URL file: %s", startCtx.listeningURLFile) + // (Re-)compute the client connection URL. We cannot do this + // earlier (e.g. above, in the runStart function) because + // at this time the address and port have not been resolved yet. + clientConnOptions, serverParams := makeServerOptionsForURL(&serverCfg) + pgURL, err := clientsecopts.MakeURLForServer(clientConnOptions, serverParams, url.User(username.RootUser)) + if err != nil { + log.Errorf(ctx, "failed computing the URL: %v", err) + return err + } + + if err = os.WriteFile(startCtx.listeningURLFile, []byte(fmt.Sprintf("%s\n", pgURL.ToPQ())), 0644); err != nil { + log.Ops.Errorf(ctx, "failed writing the URL: %v", err) + } + } + // Ensure the configuration logging is written to disk in case a // process is waiting for the sdnotify readiness to read important // information from there. log.Flush() + // When the start up completes, so can the start up span + // defined above. + defer startupSpan.Finish() + // Signal readiness. This unblocks the process when running with // --background or under systemd. if err := sdnotify.Ready(); err != nil { diff --git a/pkg/cli/start.go b/pkg/cli/start.go index 1ff684cc605b..f45ff4e624b1 100644 --- a/pkg/cli/start.go +++ b/pkg/cli/start.go @@ -399,6 +399,8 @@ func runStart(cmd *cobra.Command, args []string, startSingleNode bool) (returnEr ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // The context annotation ensures that server identifiers show up + // in the logging metadata as soon as they are known. ambientCtx := serverCfg.AmbientCtx // Annotate the context, and set up a tracing span for the start process. diff --git a/pkg/server/config.go b/pkg/server/config.go index d24def70549d..b869b32075da 100644 --- a/pkg/server/config.go +++ b/pkg/server/config.go @@ -751,7 +751,16 @@ func (cfg *Config) CreateEngines(ctx context.Context) (Engines, error) { return enginesCopy, nil } -// InitNode parses node attributes and bootstrap addresses. +// InitSQLServer finalizes the configuration of a SQL-only node. +// It initializes additional configuration flags from the environment. +func (cfg *Config) InitSQLServer(ctx context.Context) error { + cfg.readSQLEnvironmentVariables() + return nil +} + +// InitNode finalizes the configuration of a KV node. +// It parses node attributes and bootstrap addresses and +// initializes additional configuration flags from the environment. func (cfg *Config) InitNode(ctx context.Context) error { cfg.readEnvironmentVariables() @@ -801,12 +810,16 @@ func (cfg *BaseConfig) RequireWebSession() bool { return !cfg.Insecure && cfg.EnableWebSessionAuthentication } +func (cfg *Config) readSQLEnvironmentVariables() { + cfg.SpanConfigsDisabled = envutil.EnvOrDefaultBool("COCKROACH_DISABLE_SPAN_CONFIGS", cfg.SpanConfigsDisabled) + cfg.Linearizable = envutil.EnvOrDefaultBool("COCKROACH_EXPERIMENTAL_LINEARIZABLE", cfg.Linearizable) +} + // readEnvironmentVariables populates all context values that are environment // variable based. Note that this only happens when initializing a node and not // when NewContext is called. func (cfg *Config) readEnvironmentVariables() { - cfg.SpanConfigsDisabled = envutil.EnvOrDefaultBool("COCKROACH_DISABLE_SPAN_CONFIGS", cfg.SpanConfigsDisabled) - cfg.Linearizable = envutil.EnvOrDefaultBool("COCKROACH_EXPERIMENTAL_LINEARIZABLE", cfg.Linearizable) + cfg.readSQLEnvironmentVariables() cfg.ScanInterval = envutil.EnvOrDefaultDuration("COCKROACH_SCAN_INTERVAL", cfg.ScanInterval) cfg.ScanMinIdleTime = envutil.EnvOrDefaultDuration("COCKROACH_SCAN_MIN_IDLE_TIME", cfg.ScanMinIdleTime) cfg.ScanMaxIdleTime = envutil.EnvOrDefaultDuration("COCKROACH_SCAN_MAX_IDLE_TIME", cfg.ScanMaxIdleTime)