Skip to content

Commit

Permalink
Add support for startup healthchecks
Browse files Browse the repository at this point in the history
Startup healthchecks are similar to K8S startup probes, in that
they are a separate check from the regular healthcheck and run
before it. If the startup healthcheck fails repeatedly, the
associated container is restarted.

Signed-off-by: Matthew Heon <[email protected]>
  • Loading branch information
mheon committed Nov 28, 2022
1 parent 935c8eb commit d161293
Show file tree
Hide file tree
Showing 24 changed files with 551 additions and 147 deletions.
42 changes: 41 additions & 1 deletion cmd/podman/common/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
createFlags.StringVar(
&cf.HealthInterval,
healthIntervalFlagName, define.DefaultHealthCheckInterval,
"set an interval for the healthchecks (a value of disable results in no automatic timer setup)",
"set an interval for the healthcheck (a value of disable results in no automatic timer setup)",
)
_ = cmd.RegisterFlagCompletionFunc(healthIntervalFlagName, completion.AutocompleteNone)

Expand Down Expand Up @@ -428,6 +428,46 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
)
_ = cmd.RegisterFlagCompletionFunc(secretFlagName, AutocompleteSecrets)

startupHCCmdFlagName := "health-startup-cmd"
createFlags.StringVar(
&cf.StartupHCCmd,
startupHCCmdFlagName, "",
"Set a startup healthcheck command for the container",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCCmdFlagName, completion.AutocompleteNone)

startupHCIntervalFlagName := "health-startup-interval"
createFlags.StringVar(
&cf.StartupHCInterval,
startupHCIntervalFlagName, define.DefaultHealthCheckInterval,
"Set an interval for the startup healthcheck",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCIntervalFlagName, completion.AutocompleteNone)

startupHCRetriesFlagName := "health-startup-retries"
createFlags.UintVar(
&cf.StartupHCRetries,
startupHCRetriesFlagName, 0,
"Set the maximum number of retries before the startup healthcheck will restart the container",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCRetriesFlagName, completion.AutocompleteNone)

startupHCSuccessesFlagName := "health-startup-success"
createFlags.UintVar(
&cf.StartupHCSuccesses,
startupHCSuccessesFlagName, 0,
"Set the number of consecutive successes before the startup healthcheck is marked as successful and the normal healthcheck begins (0 indicates any success will start the regular healthcheck)",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCSuccessesFlagName, completion.AutocompleteNone)

startupHCTimeoutFlagName := "health-startup-timeout"
createFlags.StringVar(
&cf.StartupHCTimeout,
startupHCTimeoutFlagName, define.DefaultHealthCheckTimeout,
"Set the maximum amount of time that the startup healthcheck may take before it is considered failed",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCTimeoutFlagName, completion.AutocompleteNone)

stopSignalFlagName := "stop-signal"
createFlags.StringVar(
&cf.StopSignal,
Expand Down
2 changes: 1 addition & 1 deletion cmd/podman/healthcheck/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func run(cmd *cobra.Command, args []string) error {
if err != nil {
return err
}
if response.Status == define.HealthCheckUnhealthy {
if response.Status == define.HealthCheckUnhealthy || response.Status == define.HealthCheckStarting {
registry.SetExitCode(1)
fmt.Println(response.Status)
}
Expand Down
11 changes: 11 additions & 0 deletions docs/source/markdown/options/health-startup-cmd.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-cmd**=*"command"* | *'["command", "arg1", ...]'*

Set a startup healthcheck command for a container. This command will be executed inside the container and is used to gate the regular
healthcheck. When the startup command succeeds, the regular healthcheck will begin and the startup healthcheck will cease. Optionally,
if the command fails for a set number of attempts, the container will be restarted. A startup healthcheck can be used to ensure that
containers with an extended startup period are not marked as unhealthy until they are fully started. Startup healthchecks can only be
used when a regular healthcheck (from the container's image or the **--health-cmd** option) is also set.
7 changes: 7 additions & 0 deletions docs/source/markdown/options/health-startup-interval.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-interval**=*interval*

Set an interval for the startup healthcheck. An _interval_ of **disable** results in no automatic timer setup. The default is **30s**.
8 changes: 8 additions & 0 deletions docs/source/markdown/options/health-startup-retries.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-retries**=*retries*

The number of attempts allowed before the startup healthcheck restarts the container. If set to **0**, the container will never be
restarted. The default is **0**.
8 changes: 8 additions & 0 deletions docs/source/markdown/options/health-startup-success.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-success**=*successes*

The number of successful runs required before the startup healthcheck will succeed and the regular healthcheck will begin. A value
of **0** means that any success will begin the regular healthcheck. The default is **0**.
8 changes: 8 additions & 0 deletions docs/source/markdown/options/health-startup-timeout.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-timeout**=*timeout*

The maximum time a startup healthcheck command has to complete before it is marked as failed. The value can be expressed in a time
format like **2m3s**. The default value is **30s**.
10 changes: 10 additions & 0 deletions docs/source/markdown/podman-create.1.md.in
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,16 @@ See [**Environment**](#environment) note below for precedence and examples.

@@option health-start-period

@@option health-startup-cmd

@@option health-startup-interval

@@option health-startup-retries

@@option health-startup-success

@@option health-startup-timeout

@@option health-timeout

#### **--help**
Expand Down
10 changes: 10 additions & 0 deletions docs/source/markdown/podman-run.1.md.in
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,16 @@ See [**Environment**](#environment) note below for precedence and examples.

@@option health-start-period

@@option health-startup-cmd

@@option health-startup-interval

@@option health-startup-retries

@@option health-startup-success

@@option health-startup-timeout

@@option health-timeout

#### **--help**
Expand Down
26 changes: 26 additions & 0 deletions libpod/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,18 @@ type ContainerState struct {
// (only by restart policy).
RestartCount uint `json:"restartCount,omitempty"`

// StartupHCPassed indicates that the startup healthcheck has
// succeeded and the main healthcheck can begin.
StartupHCPassed bool `json:"startupHCPassed,omitempty"`
// StartupHCSuccessCount indicates the number of successes of the
// startup healthcheck. A startup HC can require more than one success
// to be marked as passed.
StartupHCSuccessCount int `json:"startupHCSuccessCount,omitempty"`
// StartupHCFailureCount indicates the number of failures of the startup
// healthcheck. The container will be restarted if this exceeds a set
// number in the startup HC config.
StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"`

// ExtensionStageHooks holds hooks which will be executed by libpod
// and not delegated to the OCI runtime.
ExtensionStageHooks map[string][]spec.Hook `json:"extensionStageHooks,omitempty"`
Expand Down Expand Up @@ -929,6 +941,20 @@ func (c *Container) StoppedByUser() (bool, error) {
return c.state.StoppedByUser, nil
}

// StartupHCPassed reports whether the container's startup healthcheck has
// passed. Unless the container is batched, the container lock is held and
// the state is synced before the value is read; if the sync fails, false is
// returned together with the sync error.
func (c *Container) StartupHCPassed() (bool, error) {
	// Batched containers skip locking and syncing and read the state as-is.
	if c.batched {
		return c.state.StartupHCPassed, nil
	}

	c.lock.Lock()
	defer c.lock.Unlock()

	if syncErr := c.syncContainer(); syncErr != nil {
		return false, syncErr
	}

	return c.state.StartupHCPassed, nil
}

// Misc Accessors
// Most will require locking

Expand Down
4 changes: 4 additions & 0 deletions libpod/container_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,10 @@ type ContainerMiscConfig struct {
HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"`
// HealthCheckOnFailureAction defines an action to take once the container turns unhealthy.
HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"`
// StartupHealthCheckConfig is the configuration of the startup
// healthcheck for the container. This will run before the regular HC
// runs, and when it passes the regular HC will be activated.
StartupHealthCheckConfig *define.StartupHealthCheck `json:"startupHealthCheck,omitempty"`
// PreserveFDs is a number of additional file descriptors (in addition
// to 0, 1, 2) that will be passed to the executed process. The total FDs
// passed will be 3 + PreserveFDs.
Expand Down
18 changes: 14 additions & 4 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,9 @@ func resetState(state *ContainerState) {
state.CheckpointPath = ""
state.CheckpointLog = ""
state.RestoreLog = ""
state.StartupHCPassed = false
state.StartupHCSuccessCount = 0
state.StartupHCFailureCount = 0
}

// Refresh refreshes the container's state after a restart.
Expand Down Expand Up @@ -1072,6 +1075,9 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
c.state.State = define.ContainerStateCreated
c.state.StoppedByUser = false
c.state.RestartPolicyMatch = false
c.state.StartupHCFailureCount = 0
c.state.StartupHCSuccessCount = 0
c.state.StartupHCPassed = false

if !retainRetries {
c.state.RestartCount = 0
Expand All @@ -1091,7 +1097,11 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
}

if c.config.HealthCheckConfig != nil {
if err := c.createTimer(); err != nil {
timer := c.config.HealthCheckConfig.Interval.String()
if c.config.StartupHealthCheckConfig != nil {
timer = c.config.StartupHealthCheckConfig.Interval.String()
}
if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil {
logrus.Error(err)
}
}
Expand Down Expand Up @@ -1244,7 +1254,7 @@ func (c *Container) start() error {
if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil {
logrus.Error(err)
}
if err := c.startTimer(); err != nil {
if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil {
logrus.Error(err)
}
}
Expand Down Expand Up @@ -1422,7 +1432,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
return err
}
if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(context.Background()); err != nil {
if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
logrus.Error(err.Error())
}
}
Expand Down Expand Up @@ -1859,7 +1869,7 @@ func (c *Container) cleanup(ctx context.Context) error {

// Remove healthcheck unit/timer file if it exists
if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(ctx); err != nil {
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
logrus.Errorf("Removing timer for container %s healthcheck: %v", c.ID(), err)
}
}
Expand Down
5 changes: 5 additions & 0 deletions libpod/container_validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ func (c *Container) validate() error {
}
}

// Cannot set startup HC without a healthcheck
if c.config.HealthCheckConfig == nil && c.config.StartupHealthCheckConfig != nil {
return fmt.Errorf("cannot set a startup healthcheck when there is no regular healthcheck: %w", define.ErrInvalidArg)
}

return nil
}

Expand Down
14 changes: 14 additions & 0 deletions libpod/define/healthchecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package define
import (
"fmt"
"strings"

"github.com/containers/image/v5/manifest"
)

const (
Expand Down Expand Up @@ -38,6 +40,9 @@ const (
HealthCheckInternalError HealthCheckStatus = iota
// HealthCheckDefined means the healthcheck was found on the container
HealthCheckDefined HealthCheckStatus = iota
// HealthCheckStartup means the healthcheck was unhealthy, but is still
// either within the startup HC or the startup period of the healthcheck
HealthCheckStartup HealthCheckStatus = iota
)

// Healthcheck defaults. These are used both in the cli as well in
Expand Down Expand Up @@ -131,3 +136,12 @@ func ParseHealthCheckOnFailureAction(s string) (HealthCheckOnFailureAction, erro
return HealthCheckOnFailureActionInvalid, err
}
}

// StartupHealthCheck is the configuration of a startup healthcheck.
// It embeds manifest.Schema2HealthConfig, so the embedded type's fields are
// promoted onto this struct, and adds the success threshold that is specific
// to startup healthchecks.
type StartupHealthCheck struct {
// Embedded healthcheck configuration type from the containers/image
// manifest package.
manifest.Schema2HealthConfig
// Successes are the number of successes required to mark the startup HC
// as passed.
// If set to 0, a single success will mark the HC as passed.
// The JSON tag leaves the key name at its default (the field name) and
// omits the field from the encoded output when the value is 0.
Successes int `json:",omitempty"`
}
Loading

0 comments on commit d161293

Please sign in to comment.