diff --git a/cmd/podman/common/create.go b/cmd/podman/common/create.go index ee0dacfff5..953f1818d6 100644 --- a/cmd/podman/common/create.go +++ b/cmd/podman/common/create.go @@ -180,7 +180,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions, createFlags.StringVar( &cf.HealthInterval, healthIntervalFlagName, define.DefaultHealthCheckInterval, - "set an interval for the healthchecks (a value of disable results in no automatic timer setup)", + "set an interval for the healthcheck (a value of disable results in no automatic timer setup)", ) _ = cmd.RegisterFlagCompletionFunc(healthIntervalFlagName, completion.AutocompleteNone) @@ -428,6 +428,46 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions, ) _ = cmd.RegisterFlagCompletionFunc(secretFlagName, AutocompleteSecrets) + startupHCCmdFlagName := "health-startup-cmd" + createFlags.StringVar( + &cf.StartupHCCmd, + startupHCCmdFlagName, "", + "Set a startup healthcheck command for the container", + ) + _ = cmd.RegisterFlagCompletionFunc(startupHCCmdFlagName, completion.AutocompleteNone) + + startupHCIntervalFlagName := "health-startup-interval" + createFlags.StringVar( + &cf.StartupHCInterval, + startupHCIntervalFlagName, define.DefaultHealthCheckInterval, + "Set an interval for the startup healthcheck", + ) + _ = cmd.RegisterFlagCompletionFunc(startupHCIntervalFlagName, completion.AutocompleteNone) + + startupHCRetriesFlagName := "health-startup-retries" + createFlags.UintVar( + &cf.StartupHCRetries, + startupHCRetriesFlagName, 0, + "Set the maximum number of retries before the startup healthcheck will restart the container", + ) + _ = cmd.RegisterFlagCompletionFunc(startupHCRetriesFlagName, completion.AutocompleteNone) + + startupHCSuccessesFlagName := "health-startup-success" + createFlags.UintVar( + &cf.StartupHCSuccesses, + startupHCSuccessesFlagName, 0, + "Set the number of consecutive successes before the startup healthcheck is marked as successful and the normal healthcheck begins (0 indicates any success will start the regular healthcheck)", + ) + _ = cmd.RegisterFlagCompletionFunc(startupHCSuccessesFlagName, completion.AutocompleteNone) + + startupHCTimeoutFlagName := "health-startup-timeout" + createFlags.StringVar( + &cf.StartupHCTimeout, + startupHCTimeoutFlagName, define.DefaultHealthCheckTimeout, + "Set the maximum amount of time that the startup healthcheck may take before it is considered failed", + ) + _ = cmd.RegisterFlagCompletionFunc(startupHCTimeoutFlagName, completion.AutocompleteNone) + stopSignalFlagName := "stop-signal" createFlags.StringVar( &cf.StopSignal, diff --git a/cmd/podman/healthcheck/run.go b/cmd/podman/healthcheck/run.go index 620e29baa4..6a94d9183e 100644 --- a/cmd/podman/healthcheck/run.go +++ b/cmd/podman/healthcheck/run.go @@ -35,7 +35,7 @@ func run(cmd *cobra.Command, args []string) error { if err != nil { return err } - if response.Status == define.HealthCheckUnhealthy { + if response.Status == define.HealthCheckUnhealthy || response.Status == define.HealthCheckStarting { registry.SetExitCode(1) fmt.Println(response.Status) } diff --git a/docs/source/markdown/options/health-startup-cmd.md b/docs/source/markdown/options/health-startup-cmd.md new file mode 100644 index 0000000000..cd54590a95 --- /dev/null +++ b/docs/source/markdown/options/health-startup-cmd.md @@ -0,0 +1,11 @@ +####> This option file is used in: +####> podman create, run +####> If you edit this file, make sure your changes +####> are applicable to all of those. 
+#### **--health-startup-cmd**=*"command"* | *'["command", "arg1", ...]'*
+
+Set a startup healthcheck command for a container. This command will be executed inside the container and is used to gate the regular
+healthcheck. When the startup command succeeds, the regular healthcheck will begin and the startup healthcheck will cease. Optionally,
+if the command fails for a set number of attempts, the container will be restarted. A startup healthcheck can be used to ensure that
+containers with an extended startup period are not marked as unhealthy until they are fully started. Startup healthchecks can only be
+used when a regular healthcheck (from the container's image or the **--health-cmd** option) is also set.
diff --git a/docs/source/markdown/options/health-startup-interval.md b/docs/source/markdown/options/health-startup-interval.md
new file mode 100644
index 0000000000..4681a7590b
--- /dev/null
+++ b/docs/source/markdown/options/health-startup-interval.md
@@ -0,0 +1,7 @@
+####> This option file is used in:
+####> podman create, run
+####> If you edit this file, make sure your changes
+####> are applicable to all of those.
+#### **--health-startup-interval**=*interval*
+
+Set an interval for the startup healthcheck. An _interval_ of **disable** results in no automatic timer setup. The default is **30s**.
diff --git a/docs/source/markdown/options/health-startup-retries.md b/docs/source/markdown/options/health-startup-retries.md
new file mode 100644
index 0000000000..55b37eab60
--- /dev/null
+++ b/docs/source/markdown/options/health-startup-retries.md
@@ -0,0 +1,8 @@
+####> This option file is used in:
+####> podman create, run
+####> If you edit this file, make sure your changes
+####> are applicable to all of those.
+#### **--health-startup-retries**=*retries*
+
+The number of attempts allowed before the startup healthcheck restarts the container. If set to **0**, the container will never be
+restarted. The default is **0**.
diff --git a/docs/source/markdown/options/health-startup-success.md b/docs/source/markdown/options/health-startup-success.md
new file mode 100644
index 0000000000..ddb48061f7
--- /dev/null
+++ b/docs/source/markdown/options/health-startup-success.md
@@ -0,0 +1,8 @@
+####> This option file is used in:
+####> podman create, run
+####> If you edit this file, make sure your changes
+####> are applicable to all of those.
+#### **--health-startup-success**=*successes*
+
+The number of successful runs required before the startup healthcheck will succeed and the regular healthcheck will begin. A value
+of **0** means that any success will begin the regular healthcheck. The default is **0**.
diff --git a/docs/source/markdown/options/health-startup-timeout.md b/docs/source/markdown/options/health-startup-timeout.md
new file mode 100644
index 0000000000..11ba5f6676
--- /dev/null
+++ b/docs/source/markdown/options/health-startup-timeout.md
@@ -0,0 +1,8 @@
+####> This option file is used in:
+####> podman create, run
+####> If you edit this file, make sure your changes
+####> are applicable to all of those.
+#### **--health-startup-timeout**=*timeout*
+
+The maximum time a startup healthcheck command has to complete before it is marked as failed. The value can be expressed in a time
+format like **2m3s**. The default value is **30s**.
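Taken together, the five `--health-startup-*` options above populate the new `define.StartupHealthCheck` type introduced later in this diff (libpod/define/healthchecks.go), and the validation added in libpod/container_validate.go requires a regular healthcheck to be present as well. The sketch below is illustrative only and not part of the change: it assumes the podman v4 module paths, the `specgen.NewSpecGenerator` helper, the canonical `github.com/containers/image/v5/manifest` import, and a hypothetical "alpine" image, and shows roughly how the same configuration looks when set directly on a SpecGenerator instead of via CLI flags.

```go
package main

import (
	"fmt"
	"time"

	"github.com/containers/image/v5/manifest"
	"github.com/containers/podman/v4/libpod/define" // assumed module path
	"github.com/containers/podman/v4/pkg/specgen"   // assumed module path
)

func main() {
	// Hypothetical spec for an "alpine" container; the image name and all
	// durations below are only illustrative, not defaults.
	s := specgen.NewSpecGenerator("alpine", false)

	// A regular healthcheck must be set, otherwise container validation
	// rejects the startup healthcheck.
	s.HealthConfig = &manifest.Schema2HealthConfig{
		Test:     []string{"CMD-SHELL", "echo regular"}, // --health-cmd
		Interval: 30 * time.Second,
		Timeout:  30 * time.Second,
		Retries:  3,
	}

	// The startup healthcheck gates the regular one; each field mirrors one
	// of the new CLI flags.
	s.StartupHealthConfig = &define.StartupHealthCheck{
		Schema2HealthConfig: manifest.Schema2HealthConfig{
			Test:     []string{"CMD-SHELL", "cat /test"}, // --health-startup-cmd
			Interval: 5 * time.Second,                    // --health-startup-interval
			Timeout:  10 * time.Second,                   // --health-startup-timeout
			Retries:  10,                                 // --health-startup-retries
		},
		Successes: 2, // --health-startup-success
	}

	fmt.Printf("startup healthcheck: %+v\n", s.StartupHealthConfig)
}
```

With a spec like this, libpod runs the startup check on its own systemd timer first and only swaps over to the regular healthcheck timer once the configured number of successes is reached, which is the behaviour exercised by the e2e test at the end of this diff.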
diff --git a/docs/source/markdown/podman-create.1.md.in b/docs/source/markdown/podman-create.1.md.in index 55970a8189..8e30c65df7 100644 --- a/docs/source/markdown/podman-create.1.md.in +++ b/docs/source/markdown/podman-create.1.md.in @@ -172,6 +172,16 @@ See [**Environment**](#environment) note below for precedence and examples. @@option health-start-period +@@option health-startup-cmd + +@@option health-startup-interval + +@@option health-startup-retries + +@@option health-startup-success + +@@option health-startup-timeout + @@option health-timeout #### **--help** diff --git a/docs/source/markdown/podman-run.1.md.in b/docs/source/markdown/podman-run.1.md.in index eb2435e08f..e23e88af32 100644 --- a/docs/source/markdown/podman-run.1.md.in +++ b/docs/source/markdown/podman-run.1.md.in @@ -204,6 +204,16 @@ See [**Environment**](#environment) note below for precedence and examples. @@option health-start-period +@@option health-startup-cmd + +@@option health-startup-interval + +@@option health-startup-retries + +@@option health-startup-success + +@@option health-startup-timeout + @@option health-timeout #### **--help** diff --git a/libpod/container.go b/libpod/container.go index 1e38bc87a5..a3187056a4 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -200,6 +200,18 @@ type ContainerState struct { // (only by restart policy). RestartCount uint `json:"restartCount,omitempty"` + // StartupHCPassed indicates that the startup healthcheck has + // succeeded and the main healthcheck can begin. + StartupHCPassed bool `json:"startupHCPassed,omitempty"` + // StartupHCSuccessCount indicates the number of successes of the + // startup healthcheck. A startup HC can require more than one success + // to be marked as passed. + StartupHCSuccessCount int `json:"startupHCSuccessCount,omitempty"` + // StartupHCFailureCount indicates the number of failures of the startup + // healthcheck. The container will be restarted if this exceed a set + // number in the startup HC config. + StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"` + // ExtensionStageHooks holds hooks which will be executed by libpod // and not delegated to the OCI runtime. ExtensionStageHooks map[string][]spec.Hook `json:"extensionStageHooks,omitempty"` @@ -929,6 +941,20 @@ func (c *Container) StoppedByUser() (bool, error) { return c.state.StoppedByUser, nil } +// StartupHCPassed returns whether the container's startup healthcheck passed. +func (c *Container) StartupHCPassed() (bool, error) { + if !c.batched { + c.lock.Lock() + defer c.lock.Unlock() + + if err := c.syncContainer(); err != nil { + return false, err + } + } + + return c.state.StartupHCPassed, nil +} + // Misc Accessors // Most will require locking diff --git a/libpod/container_config.go b/libpod/container_config.go index f3585d22c6..833113cff9 100644 --- a/libpod/container_config.go +++ b/libpod/container_config.go @@ -395,6 +395,10 @@ type ContainerMiscConfig struct { HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"` // HealthCheckOnFailureAction defines an action to take once the container turns unhealthy. HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"` + // StartupHealthCheckConfig is the configuration of the startup + // healthcheck for the container. This will run before the regular HC + // runs, and when it passes the regular HC will be activated. 
+ StartupHealthCheckConfig *define.StartupHealthCheck `json:"startupHealthCheck,omitempty"` // PreserveFDs is a number of additional file descriptors (in addition // to 0, 1, 2) that will be passed to the executed process. The total FDs // passed will be 3 + PreserveFDs. diff --git a/libpod/container_internal.go b/libpod/container_internal.go index c168144426..9926bdfa27 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -622,6 +622,9 @@ func resetState(state *ContainerState) { state.CheckpointPath = "" state.CheckpointLog = "" state.RestoreLog = "" + state.StartupHCPassed = false + state.StartupHCSuccessCount = 0 + state.StartupHCFailureCount = 0 } // Refresh refreshes the container's state after a restart. @@ -1072,6 +1075,9 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { c.state.State = define.ContainerStateCreated c.state.StoppedByUser = false c.state.RestartPolicyMatch = false + c.state.StartupHCFailureCount = 0 + c.state.StartupHCSuccessCount = 0 + c.state.StartupHCPassed = false if !retainRetries { c.state.RestartCount = 0 @@ -1091,7 +1097,11 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { } if c.config.HealthCheckConfig != nil { - if err := c.createTimer(); err != nil { + timer := c.config.HealthCheckConfig.Interval.String() + if c.config.StartupHealthCheckConfig != nil { + timer = c.config.StartupHealthCheckConfig.Interval.String() + } + if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil { logrus.Error(err) } } @@ -1244,7 +1254,7 @@ func (c *Container) start() error { if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil { logrus.Error(err) } - if err := c.startTimer(); err != nil { + if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil { logrus.Error(err) } } @@ -1422,7 +1432,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr return err } if c.config.HealthCheckConfig != nil { - if err := c.removeTransientFiles(context.Background()); err != nil { + if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { logrus.Error(err.Error()) } } @@ -1859,7 +1869,7 @@ func (c *Container) cleanup(ctx context.Context) error { // Remove healthcheck unit/timer file if it execs if c.config.HealthCheckConfig != nil { - if err := c.removeTransientFiles(ctx); err != nil { + if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { logrus.Errorf("Removing timer for container %s healthcheck: %v", c.ID(), err) } } diff --git a/libpod/container_validate.go b/libpod/container_validate.go index 25b0eee0ff..22f89f2203 100644 --- a/libpod/container_validate.go +++ b/libpod/container_validate.go @@ -156,6 +156,11 @@ func (c *Container) validate() error { } } + // Cannot set startup HC without a healthcheck + if c.config.HealthCheckConfig == nil && c.config.StartupHealthCheckConfig != nil { + return fmt.Errorf("cannot set a startup healthcheck when there is no regular healthcheck: %w", define.ErrInvalidArg) + } + return nil } diff --git a/libpod/define/healthchecks.go b/libpod/define/healthchecks.go index 274e025611..15ea79fc20 100644 --- a/libpod/define/healthchecks.go +++ b/libpod/define/healthchecks.go @@ -3,6 +3,8 @@ package define import ( "fmt" "strings" + + "github.com/containers/image/v5/manifest" ) const ( @@ -38,6 +40,9 @@ const ( HealthCheckInternalError 
HealthCheckStatus = iota // HealthCheckDefined means the healthcheck was found on the container HealthCheckDefined HealthCheckStatus = iota + // HealthCheckStartup means the healthcheck was unhealthy, but is still + // either within the startup HC or the startup period of the healthcheck + HealthCheckStartup HealthCheckStatus = iota ) // Healthcheck defaults. These are used both in the cli as well in @@ -131,3 +136,12 @@ func ParseHealthCheckOnFailureAction(s string) (HealthCheckOnFailureAction, erro return HealthCheckOnFailureActionInvalid, err } } + +// StartupHealthCheck is the configuration of a startup healthcheck. +type StartupHealthCheck struct { + manifest.Schema2HealthConfig + // Successes are the number of successes required to mark the startup HC + // as passed. + // If set to 0, a single success will mark the HC as passed. + Successes int `json:",omitempty"` +} diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 1a81360f41..165520ac49 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -25,7 +25,7 @@ const ( // HealthCheck verifies the state and validity of the healthcheck configuration // on the container and then executes the healthcheck -func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { +func (r *Runtime) HealthCheck(ctx context.Context, name string) (define.HealthCheckStatus, error) { container, err := r.LookupContainer(name) if err != nil { return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err) @@ -36,21 +36,35 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { return hcStatus, err } - hcStatus, logStatus, err := container.runHealthCheck() - if err := container.processHealthCheckStatus(logStatus); err != nil { - return hcStatus, err + isStartupHC := false + if container.config.StartupHealthCheckConfig != nil { + passed, err := container.StartupHCPassed() + if err != nil { + return define.HealthCheckInternalError, err + } + isStartupHC = !passed + } + + hcStatus, logStatus, err := container.runHealthCheck(ctx, isStartupHC) + if !isStartupHC { + if err := container.processHealthCheckStatus(logStatus); err != nil { + return hcStatus, err + } } return hcStatus, err } -// runHealthCheck runs the health check as defined by the container -func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) { +func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.HealthCheckStatus, string, error) { var ( newCommand []string returnCode int inStartPeriod bool ) hcCommand := c.HealthCheckConfig().Test + if isStartup { + logrus.Debugf("Running startup healthcheck for container %s", c.ID()) + hcCommand = c.config.StartupHealthCheckConfig.Test + } if len(hcCommand) < 1 { return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID()) } @@ -113,6 +127,18 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) { hcResult = define.HealthCheckFailure returnCode = 1 } + + // Handle startup HC + if isStartup { + inStartPeriod = true + if hcErr != nil || exitCode != 0 { + hcResult = define.HealthCheckStartup + c.incrementStartupHCFailureCounter(ctx) + } else { + c.incrementStartupHCSuccessCounter(ctx) + } + } + timeEnd := time.Now() if c.HealthCheckConfig().StartPeriod > 0 { // there is a start-period we need to honor; we add startPeriod to container start time @@ -188,6 +214,114 @@ func checkHealthCheckCanBeRun(c *Container) 
(define.HealthCheckStatus, error) { return define.HealthCheckDefined, nil } +// Increment the current startup healthcheck success counter. +// Can stop the startup HC and start the regular HC if the startup HC has enough +// consecutive successes. +func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) { + if !c.batched { + c.lock.Lock() + defer c.lock.Unlock() + + if err := c.syncContainer(); err != nil { + logrus.Errorf("Error syncing container %s state: %v", c.ID(), err) + return + } + } + + // We don't have a startup HC, can't do anything + if c.config.StartupHealthCheckConfig == nil { + return + } + + // Race: someone else got here first + if c.state.StartupHCPassed { + return + } + + // Increment the success counter + c.state.StartupHCSuccessCount++ + + logrus.Debugf("Startup healthcheck for container %s succeeded, success counter now %d", c.ID(), c.state.StartupHCSuccessCount) + + // Did we exceed threshold? + recreateTimer := false + if c.config.StartupHealthCheckConfig.Successes == 0 || c.state.StartupHCSuccessCount >= c.config.StartupHealthCheckConfig.Successes { + c.state.StartupHCPassed = true + c.state.StartupHCSuccessCount = 0 + c.state.StartupHCFailureCount = 0 + + recreateTimer = true + } + + if err := c.save(); err != nil { + logrus.Errorf("Error saving container %s state: %v", c.ID(), err) + return + } + + if recreateTimer { + logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID()) + + // Create the new, standard healthcheck timer first. + if err := c.createTimer(c.HealthCheckConfig().Interval.String(), false); err != nil { + logrus.Errorf("Error recreating container %s healthcheck: %v", c.ID(), err) + return + } + if err := c.startTimer(false); err != nil { + logrus.Errorf("Error restarting container %s healthcheck timer: %v", c.ID(), err) + } + + // This kills the process the healthcheck is running. + // Which happens to be us. + // So this has to be last - after this, systemd serves us a + // SIGTERM and we exit. + if err := c.removeTransientFiles(ctx, true); err != nil { + logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err) + return + } + } +} + +// Increment the current startup healthcheck failure counter. +// Can restart the container if the HC fails enough times consecutively. 
+func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) { + if !c.batched { + c.lock.Lock() + defer c.lock.Unlock() + + if err := c.syncContainer(); err != nil { + logrus.Errorf("Error syncing container %s state: %v", c.ID(), err) + return + } + } + + // We don't have a startup HC, can't do anything + if c.config.StartupHealthCheckConfig == nil { + return + } + + // Race: someone else got here first + if c.state.StartupHCPassed { + return + } + + c.state.StartupHCFailureCount++ + + logrus.Debugf("Startup healthcheck for container %s failed, failure counter now %d", c.ID(), c.state.StartupHCFailureCount) + + if c.config.StartupHealthCheckConfig.Retries != 0 && c.state.StartupHCFailureCount >= c.config.StartupHealthCheckConfig.Retries { + logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID()) + // Restart the container + if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil { + logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), err) + } + return + } + + if err := c.save(); err != nil { + logrus.Errorf("Error saving container %s state: %v", c.ID(), err) + } +} + func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog { return define.HealthCheckLog{ Start: start.Format(time.RFC3339Nano), @@ -299,12 +433,26 @@ func (c *Container) healthCheckStatus() (string, error) { return results.Status, nil } -func (c *Container) disableHealthCheckSystemd() bool { +func (c *Container) disableHealthCheckSystemd(isStartup bool) bool { if os.Getenv("DISABLE_HC_SYSTEMD") == "true" { return true } + if isStartup { + if c.config.StartupHealthCheckConfig.Interval == 0 { + return true + } + } if c.config.HealthCheckConfig.Interval == 0 { return true } return false } + +// Systemd unit name for the healthcheck systemd unit +func (c *Container) hcUnitName(isStartup bool) string { + unitName := c.ID() + if isStartup { + unitName += "-startup" + } + return unitName +} diff --git a/libpod/healthcheck_linux.go b/libpod/healthcheck_linux.go index 6948f976a8..5dda104107 100644 --- a/libpod/healthcheck_linux.go +++ b/libpod/healthcheck_linux.go @@ -14,8 +14,8 @@ import ( ) // createTimer systemd timers for healthchecks of a container -func (c *Container) createTimer() error { - if c.disableHealthCheckSystemd() { +func (c *Container) createTimer(interval string, isStartup bool) error { + if c.disableHealthCheckSystemd(isStartup) { return nil } podman, err := os.Executable() @@ -31,7 +31,14 @@ func (c *Container) createTimer() error { if path != "" { cmd = append(cmd, "--setenv=PATH="+path) } - cmd = append(cmd, "--unit", c.ID(), fmt.Sprintf("--on-unit-inactive=%s", c.HealthCheckConfig().Interval.String()), "--timer-property=AccuracySec=1s", podman, "healthcheck", "run", c.ID()) + + cmd = append(cmd, "--unit", c.hcUnitName(isStartup), fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman) + + if logrus.IsLevelEnabled(logrus.DebugLevel) { + cmd = append(cmd, "--log-level=debug", "--syslog") + } + + cmd = append(cmd, "healthcheck", "run", c.ID()) conn, err := systemd.ConnectToDBUS() if err != nil { @@ -58,8 +65,8 @@ func systemdOpSuccessful(c chan string) error { } // startTimer starts a systemd timer for the healthchecks -func (c *Container) startTimer() error { - if c.disableHealthCheckSystemd() { +func (c *Container) startTimer(isStartup bool) error { + if c.disableHealthCheckSystemd(isStartup) { return nil } conn, err := systemd.ConnectToDBUS() @@ -68,7 +75,7 @@ 
func (c *Container) startTimer() error { } defer conn.Close() - startFile := fmt.Sprintf("%s.service", c.ID()) + startFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup)) startChan := make(chan string) if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil { return err @@ -82,8 +89,8 @@ func (c *Container) startTimer() error { // removeTransientFiles removes the systemd timer and unit files // for the container -func (c *Container) removeTransientFiles(ctx context.Context) error { - if c.disableHealthCheckSystemd() { +func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error { + if c.disableHealthCheckSystemd(isStartup) { return nil } conn, err := systemd.ConnectToDBUS() @@ -99,7 +106,7 @@ func (c *Container) removeTransientFiles(ctx context.Context) error { // Stop the timer before the service to make sure the timer does not // fire after the service is stopped. timerChan := make(chan string) - timerFile := fmt.Sprintf("%s.timer", c.ID()) + timerFile := fmt.Sprintf("%s.timer", c.hcUnitName(isStartup)) if _, err := conn.StopUnitContext(ctx, timerFile, "fail", timerChan); err != nil { if !strings.HasSuffix(err.Error(), ".timer not loaded.") { stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err)) @@ -111,7 +118,7 @@ func (c *Container) removeTransientFiles(ctx context.Context) error { // Reset the service before stopping it to make sure it's being removed // on stop. serviceChan := make(chan string) - serviceFile := fmt.Sprintf("%s.service", c.ID()) + serviceFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup)) if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil { logrus.Debugf("Failed to reset unit file: %q", err) } diff --git a/libpod/options.go b/libpod/options.go index 8ea527632f..94da2158d6 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -1898,6 +1898,21 @@ func WithInfraConfig(compatibleOptions InfraInherit) CtrCreateOption { } } +// WithStartupHealthcheck sets a startup healthcheck for the container. +// Requires that a healthcheck must be set. 
+func WithStartupHealthcheck(startupHC *define.StartupHealthCheck) CtrCreateOption { + return func(ctr *Container) error { + if ctr.valid { + return define.ErrCtrFinalized + } + ctr.config.StartupHealthCheckConfig = new(define.StartupHealthCheck) + if err := JSONDeepCopy(startupHC, ctr.config.StartupHealthCheckConfig); err != nil { + return fmt.Errorf("error copying startup healthcheck into container: %w", err) + } + return nil + } +} + // Pod Creation Options // WithPodCreateCommand adds the full command plus arguments of the current diff --git a/pkg/api/handlers/libpod/healthcheck.go b/pkg/api/handlers/libpod/healthcheck.go index bff092d214..42df39c0c0 100644 --- a/pkg/api/handlers/libpod/healthcheck.go +++ b/pkg/api/handlers/libpod/healthcheck.go @@ -12,7 +12,7 @@ import ( func RunHealthCheck(w http.ResponseWriter, r *http.Request) { runtime := r.Context().Value(api.RuntimeKey).(*libpod.Runtime) name := utils.GetName(r) - status, err := runtime.HealthCheck(name) + status, err := runtime.HealthCheck(r.Context(), name) if err != nil { if status == define.HealthCheckContainerNotFound { utils.ContainerNotFound(w, name, err) @@ -32,6 +32,8 @@ func RunHealthCheck(w http.ResponseWriter, r *http.Request) { hcStatus := define.HealthCheckUnhealthy if status == define.HealthCheckSuccess { hcStatus = define.HealthCheckHealthy + } else if status == define.HealthCheckStartup { + hcStatus = define.HealthCheckStarting } report := define.HealthCheckResults{ Status: hcStatus, diff --git a/pkg/domain/entities/pods.go b/pkg/domain/entities/pods.go index b02bbc86ca..a70e3c8dca 100644 --- a/pkg/domain/entities/pods.go +++ b/pkg/domain/entities/pods.go @@ -174,125 +174,129 @@ const ( ) type ContainerCreateOptions struct { - Annotation []string - Attach []string - Authfile string - BlkIOWeight string - BlkIOWeightDevice []string - CapAdd []string - CapDrop []string - CgroupNS string - CgroupsMode string - CgroupParent string `json:"cgroup_parent,omitempty"` - CIDFile string - ConmonPIDFile string `json:"container_conmon_pidfile,omitempty"` - CPUPeriod uint64 - CPUQuota int64 - CPURTPeriod uint64 - CPURTRuntime int64 - CPUShares uint64 - CPUS float64 `json:"cpus,omitempty"` - CPUSetCPUs string `json:"cpuset_cpus,omitempty"` - CPUSetMems string - Devices []string `json:"devices,omitempty"` - DeviceCgroupRule []string - DeviceReadBPs []string `json:"device_read_bps,omitempty"` - DeviceReadIOPs []string - DeviceWriteBPs []string - DeviceWriteIOPs []string - Entrypoint *string `json:"container_command,omitempty"` - Env []string - EnvHost bool - EnvFile []string - Expose []string - GIDMap []string - GroupAdd []string - HealthCmd string - HealthInterval string - HealthRetries uint - HealthStartPeriod string - HealthTimeout string - HealthOnFailure string - Hostname string `json:"hostname,omitempty"` - HTTPProxy bool - HostUsers []string - ImageVolume string - Init bool - InitContainerType string - InitPath string - Interactive bool - IPC string - Label []string - LabelFile []string - LogDriver string - LogOptions []string - Memory string - MemoryReservation string - MemorySwap string - MemorySwappiness int64 - Name string `json:"container_name"` - NoHealthCheck bool - OOMKillDisable bool - OOMScoreAdj *int - Arch string - OS string - Variant string - PID string `json:"pid,omitempty"` - PIDsLimit *int64 - Platform string - Pod string - PodIDFile string - Personality string - PreserveFDs uint - Privileged bool - PublishAll bool - Pull string - Quiet bool - ReadOnly bool - ReadOnlyTmpFS bool - Restart string - Replace 
bool - Requires []string - Rm bool - RootFS bool - Secrets []string - SecurityOpt []string `json:"security_opt,omitempty"` - SdNotifyMode string - ShmSize string - SignaturePolicy string - StopSignal string - StopTimeout uint - StorageOpts []string - SubUIDName string - SubGIDName string - Sysctl []string `json:"sysctl,omitempty"` - Systemd string - Timeout uint - TLSVerify commonFlag.OptionalBool - TmpFS []string - TTY bool - Timezone string - Umask string - EnvMerge []string - UnsetEnv []string - UnsetEnvAll bool - UIDMap []string - Ulimit []string - User string - UserNS string `json:"-"` - UTS string - Mount []string - Volume []string `json:"volume,omitempty"` - VolumesFrom []string `json:"volumes_from,omitempty"` - Workdir string - SeccompPolicy string - PidFile string - ChrootDirs []string - IsInfra bool - IsClone bool - DecryptionKeys []string - - Net *NetOptions `json:"net,omitempty"` + Annotation []string + Attach []string + Authfile string + BlkIOWeight string + BlkIOWeightDevice []string + CapAdd []string + CapDrop []string + CgroupNS string + CgroupsMode string + CgroupParent string `json:"cgroup_parent,omitempty"` + CIDFile string + ConmonPIDFile string `json:"container_conmon_pidfile,omitempty"` + CPUPeriod uint64 + CPUQuota int64 + CPURTPeriod uint64 + CPURTRuntime int64 + CPUShares uint64 + CPUS float64 `json:"cpus,omitempty"` + CPUSetCPUs string `json:"cpuset_cpus,omitempty"` + CPUSetMems string + Devices []string `json:"devices,omitempty"` + DeviceCgroupRule []string + DeviceReadBPs []string `json:"device_read_bps,omitempty"` + DeviceReadIOPs []string + DeviceWriteBPs []string + DeviceWriteIOPs []string + Entrypoint *string `json:"container_command,omitempty"` + Env []string + EnvHost bool + EnvFile []string + Expose []string + GIDMap []string + GroupAdd []string + HealthCmd string + HealthInterval string + HealthRetries uint + HealthStartPeriod string + HealthTimeout string + HealthOnFailure string + Hostname string `json:"hostname,omitempty"` + HTTPProxy bool + HostUsers []string + ImageVolume string + Init bool + InitContainerType string + InitPath string + Interactive bool + IPC string + Label []string + LabelFile []string + LogDriver string + LogOptions []string + Memory string + MemoryReservation string + MemorySwap string + MemorySwappiness int64 + Name string `json:"container_name"` + NoHealthCheck bool + OOMKillDisable bool + OOMScoreAdj *int + Arch string + OS string + Variant string + PID string `json:"pid,omitempty"` + PIDsLimit *int64 + Platform string + Pod string + PodIDFile string + Personality string + PreserveFDs uint + Privileged bool + PublishAll bool + Pull string + Quiet bool + ReadOnly bool + ReadOnlyTmpFS bool + Restart string + Replace bool + Requires []string + Rm bool + RootFS bool + Secrets []string + SecurityOpt []string `json:"security_opt,omitempty"` + SdNotifyMode string + ShmSize string + SignaturePolicy string + StartupHCCmd string + StartupHCInterval string + StartupHCRetries uint + StartupHCSuccesses uint + StartupHCTimeout string + StopSignal string + StopTimeout uint + StorageOpts []string + SubUIDName string + SubGIDName string + Sysctl []string `json:"sysctl,omitempty"` + Systemd string + Timeout uint + TLSVerify commonFlag.OptionalBool + TmpFS []string + TTY bool + Timezone string + Umask string + EnvMerge []string + UnsetEnv []string + UnsetEnvAll bool + UIDMap []string + Ulimit []string + User string + UserNS string `json:"-"` + UTS string + Mount []string + Volume []string `json:"volume,omitempty"` + VolumesFrom []string 
`json:"volumes_from,omitempty"` + Workdir string + SeccompPolicy string + PidFile string + ChrootDirs []string + IsInfra bool + IsClone bool + DecryptionKeys []string + Net *NetOptions `json:"net,omitempty"` CgroupConf []string diff --git a/pkg/domain/infra/abi/healthcheck.go b/pkg/domain/infra/abi/healthcheck.go index c4d8cfb096..8761521ccb 100644 --- a/pkg/domain/infra/abi/healthcheck.go +++ b/pkg/domain/infra/abi/healthcheck.go @@ -8,13 +8,15 @@ import ( ) func (ic *ContainerEngine) HealthCheckRun(ctx context.Context, nameOrID string, options entities.HealthCheckOptions) (*define.HealthCheckResults, error) { - status, err := ic.Libpod.HealthCheck(nameOrID) + status, err := ic.Libpod.HealthCheck(ctx, nameOrID) if err != nil { return nil, err } hcStatus := define.HealthCheckUnhealthy if status == define.HealthCheckSuccess { hcStatus = define.HealthCheckHealthy + } else if status == define.HealthCheckStartup { + hcStatus = define.HealthCheckStarting } report := define.HealthCheckResults{ Status: hcStatus, diff --git a/pkg/specgen/generate/container_create.go b/pkg/specgen/generate/container_create.go index 35928a57fd..ea2fde1502 100644 --- a/pkg/specgen/generate/container_create.go +++ b/pkg/specgen/generate/container_create.go @@ -527,6 +527,9 @@ func createContainerOptions(rt *libpod.Runtime, s *specgen.SpecGenerator, pod *l options = append(options, libpod.WithHealthCheck(s.ContainerHealthCheckConfig.HealthConfig)) logrus.Debugf("New container has a health check") } + if s.ContainerHealthCheckConfig.StartupHealthConfig != nil { + options = append(options, libpod.WithStartupHealthcheck(s.ContainerHealthCheckConfig.StartupHealthConfig)) + } if s.ContainerHealthCheckConfig.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone { options = append(options, libpod.WithHealthCheckOnFailureAction(s.ContainerHealthCheckConfig.HealthCheckOnFailureAction)) diff --git a/pkg/specgen/specgen.go b/pkg/specgen/specgen.go index 50b3d6e47f..6daa4dc018 100644 --- a/pkg/specgen/specgen.go +++ b/pkg/specgen/specgen.go @@ -536,6 +536,10 @@ type ContainerResourceConfig struct { type ContainerHealthCheckConfig struct { HealthConfig *manifest.Schema2HealthConfig `json:"healthconfig,omitempty"` HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"health_check_on_failure_action,omitempty"` + // Startup healthcheck for a container. + // Requires that HealthConfig be set. + // Optional. 
+ StartupHealthConfig *define.StartupHealthCheck `json:"startupHealthConfig,omitempty"` } // SpecGenerator creates an OCI spec and Libpod configuration options to create diff --git a/pkg/specgenutil/specgen.go b/pkg/specgenutil/specgen.go index 2695d88732..83c9ddd0f0 100644 --- a/pkg/specgenutil/specgen.go +++ b/pkg/specgenutil/specgen.go @@ -256,7 +256,7 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions if c.NoHealthCheck { return errors.New("cannot specify both --no-healthcheck and --health-cmd") } - s.HealthConfig, err = makeHealthCheckFromCli(c.HealthCmd, c.HealthInterval, c.HealthRetries, c.HealthTimeout, c.HealthStartPeriod) + s.HealthConfig, err = makeHealthCheckFromCli(c.HealthCmd, c.HealthInterval, c.HealthRetries, c.HealthTimeout, c.HealthStartPeriod, false) if err != nil { return err } @@ -272,6 +272,25 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions } s.HealthCheckOnFailureAction = onFailureAction + if c.StartupHCCmd != "" { + if c.NoHealthCheck { + return errors.New("cannot specify both --no-healthcheck and --health-startup-cmd") + } + // The hardcoded "1s" will be discarded, as the startup + // healthcheck does not have a period. So just hardcode + // something that parses correctly. + tmpHcConfig, err := makeHealthCheckFromCli(c.StartupHCCmd, c.StartupHCInterval, c.StartupHCRetries, c.StartupHCTimeout, "1s", true) + if err != nil { + return err + } + s.StartupHealthConfig = new(define.StartupHealthCheck) + s.StartupHealthConfig.Test = tmpHcConfig.Test + s.StartupHealthConfig.Interval = tmpHcConfig.Interval + s.StartupHealthConfig.Timeout = tmpHcConfig.Timeout + s.StartupHealthConfig.Retries = tmpHcConfig.Retries + s.StartupHealthConfig.Successes = int(c.StartupHCSuccesses) + } + if err := setNamespaces(s, c); err != nil { return err } @@ -838,7 +857,7 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions return nil } -func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, startPeriod string) (*manifest.Schema2HealthConfig, error) { +func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, startPeriod string, isStartup bool) (*manifest.Schema2HealthConfig, error) { cmdArr := []string{} isArr := true err := json.Unmarshal([]byte(inCmd), &cmdArr) // array unmarshalling @@ -886,7 +905,7 @@ func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, start hc.Interval = intervalDuration - if retries < 1 { + if retries < 1 && !isStartup { return nil, errors.New("healthcheck-retries must be greater than 0") } hc.Retries = int(retries) diff --git a/test/e2e/healthcheck_run_test.go b/test/e2e/healthcheck_run_test.go index ace52a5156..c1afe00de8 100644 --- a/test/e2e/healthcheck_run_test.go +++ b/test/e2e/healthcheck_run_test.go @@ -334,4 +334,43 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE) // Check to make sure characters were not coerced to utf8 Expect(inspect[0].Config.Healthcheck).To(HaveField("Test", []string{"CMD-SHELL", "ls -l / 2>&1"})) }) + + It("Startup healthcheck success transitions to regular healthcheck", func() { + ctrName := "hcCtr" + ctrRun := podmanTest.Podman([]string{"run", "-dt", "--name", ctrName, "--health-cmd", "echo regular", "--health-startup-cmd", "cat /test", ALPINE, "top"}) + ctrRun.WaitWithDefaultTimeout() + Expect(ctrRun).Should(Exit(0)) + + inspect := podmanTest.InspectContainer(ctrName) + Expect(inspect[0].State.Health).To(HaveField("Status", "starting")) + + hc := podmanTest.Podman([]string{"healthcheck", 
"run", ctrName}) + hc.WaitWithDefaultTimeout() + Expect(hc).Should(Exit(1)) + + exec := podmanTest.Podman([]string{"exec", ctrName, "sh", "-c", "touch /test && echo startup > /test"}) + exec.WaitWithDefaultTimeout() + Expect(exec).Should(Exit(0)) + + hc = podmanTest.Podman([]string{"healthcheck", "run", ctrName}) + hc.WaitWithDefaultTimeout() + Expect(hc).Should(Exit(0)) + + inspect = podmanTest.InspectContainer(ctrName) + Expect(inspect[0].State.Health).To(HaveField("Status", define.HealthCheckHealthy)) + + hc = podmanTest.Podman([]string{"healthcheck", "run", ctrName}) + hc.WaitWithDefaultTimeout() + Expect(hc).Should(Exit(0)) + + inspect = podmanTest.InspectContainer(ctrName) + Expect(inspect[0].State.Health).To(HaveField("Status", define.HealthCheckHealthy)) + + // Test podman ps --filter heath is working (#11687) + ps := podmanTest.Podman([]string{"ps", "--filter", "health=healthy"}) + ps.WaitWithDefaultTimeout() + Expect(ps).Should(Exit(0)) + Expect(ps.OutputToStringArray()).To(HaveLen(2)) + Expect(ps.OutputToString()).To(ContainSubstring("hc")) + }) })