Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

health checks: make on-failure action retry aware #16084

Merged
merged 1 commit into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 24 additions & 19 deletions libpod/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,31 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) {
}

hcStatus, err := checkHealthCheckCanBeRun(container)
if err == nil {
hcStatus, err := container.runHealthCheck()
if err := container.processHealthCheckStatus(hcStatus); err != nil {
return hcStatus, err
}
if err != nil {
return hcStatus, err
}

hcStatus, logStatus, err := container.runHealthCheck()
if err := container.processHealthCheckStatus(logStatus); err != nil {
return hcStatus, err
}
return hcStatus, err
}

// runHealthCheck runs the health check as defined by the container
func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) {
var (
newCommand []string
returnCode int
inStartPeriod bool
)
hcCommand := c.HealthCheckConfig().Test
if len(hcCommand) < 1 {
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
}
switch hcCommand[0] {
case "", define.HealthConfigTestNone:
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
case define.HealthConfigTestCmd:
newCommand = hcCommand[1:]
case define.HealthConfigTestCmdShell:
Expand All @@ -66,11 +67,11 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
newCommand = hcCommand
}
if len(newCommand) < 1 || newCommand[0] == "" {
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
}
rPipe, wPipe, err := os.Pipe()
if err != nil {
return define.HealthCheckInternalError, fmt.Errorf("unable to create pipe for healthcheck session: %w", err)
return define.HealthCheckInternalError, "", fmt.Errorf("unable to create pipe for healthcheck session: %w", err)
}
defer wPipe.Close()
defer rPipe.Close()
Expand Down Expand Up @@ -135,15 +136,16 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
}

hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil {
return hcResult, fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err)
logStatus, err := c.updateHealthCheckLog(hcl, inStartPeriod)
if err != nil {
return hcResult, "", fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err)
}

return hcResult, hcErr
return hcResult, logStatus, hcErr
}

func (c *Container) processHealthCheckStatus(status define.HealthCheckStatus) error {
if status == define.HealthCheckSuccess {
func (c *Container) processHealthCheckStatus(status string) error {
if status != define.HealthCheckUnhealthy {
return nil
}

Expand Down Expand Up @@ -211,10 +213,13 @@ func (c *Container) updateHealthStatus(status string) error {
}

// UpdateHealthCheckLog parses the health check results and writes the log
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) error {
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) {
c.lock.Lock()
defer c.lock.Unlock()

healthCheck, err := c.getHealthCheckLog()
if err != nil {
return err
return "", err
}
if hcl.ExitCode == 0 {
// set status to healthy, reset failing state to 0
Expand All @@ -239,9 +244,9 @@ func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPerio
}
newResults, err := json.Marshal(healthCheck)
if err != nil {
return fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
return "", fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
}
return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
return healthCheck.Status, os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
}

// HealthCheckLogPath returns the path for where the health check log is
Expand Down
9 changes: 5 additions & 4 deletions test/system/220-healthcheck.bats
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ function _check_health {
--health-cmd /healthcheck \
--health-interval 1s \
--health-retries 3 \
--health-on-failure=kill \
healthcheck_i

run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}"
is "$output" "none" "default on-failure action is none"
is "$output" "kill" "on-failure action is set to kill"

# We can't check for 'starting' because a 1-second interval is too
# short; it could run healthcheck before we get to our first check.
Expand Down Expand Up @@ -67,9 +68,8 @@ Log[-1].ExitCode | 1
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
"

# healthcheck should now fail, with exit status 1 and 'unhealthy' output
run_podman 1 healthcheck run healthcheck_c
is "$output" "unhealthy" "output from 'podman healthcheck run'"
# now the on-failure should kick in and kill the container
podman wait healthcheck_c

# Clean up
run_podman rm -t 0 -f healthcheck_c
Expand All @@ -95,6 +95,7 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
# Run that healthcheck image.
run_podman run -d --name $ctr \
--health-cmd /healthcheck \
--health-retries=1 \
--health-on-failure=$policy \
$img

Expand Down
1 change: 1 addition & 0 deletions test/system/250-systemd.bats
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ LISTEN_FDNAMES=listen_fdnames" | sort)
run_podman create --name $cname \
--health-cmd /healthcheck \
--health-on-failure=kill \
--health-retries=1 \
--restart=on-failure \
$img

Expand Down