Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v4.4] fix --health-on-failure=restart in transient unit #17862

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,15 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
}

func (c *Container) shouldRestart() bool {
if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart {
isUnhealthy, err := c.isUnhealthy()
if err != nil {
logrus.Errorf("Checking if container is unhealthy: %v", err)
} else if isUnhealthy {
return true
}
}

// If we did not get a restart policy match, return false
// Do the same if we're not a policy that restarts.
if !c.state.RestartPolicyMatch ||
Expand Down Expand Up @@ -267,6 +276,12 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err
return false, err
}

if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
return false, err
}
}

// Is the container running again?
// If so, we don't have to do anything
if c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
Expand Down Expand Up @@ -1431,6 +1446,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
if err := c.stop(timeout); err != nil {
return err
}

if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
logrus.Error(err.Error())
Expand Down
20 changes: 18 additions & 2 deletions libpod/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,12 @@ func (c *Container) processHealthCheckStatus(status string) error {
}

case define.HealthCheckOnFailureActionRestart:
if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil {
return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err)
// We let the cleanup process handle the restart. Otherwise
// the container would be restarted in the context of a
// transient systemd unit which may cause undesired side
// effects.
if err := c.Stop(); err != nil {
return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err)
}

case define.HealthCheckOnFailureActionStop:
Expand Down Expand Up @@ -346,6 +350,18 @@ func (c *Container) updateHealthStatus(status string) error {
return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
}

// isUnhealthy returns if the current health check status in unhealthy.
func (c *Container) isUnhealthy() (bool, error) {
if !c.HasHealthCheck() {
return false, nil
}
healthCheck, err := c.getHealthCheckLog()
if err != nil {
return false, err
}
return healthCheck.Status == define.HealthCheckUnhealthy, nil
}

// UpdateHealthCheckLog parses the health check results and writes the log
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) {
c.lock.Lock()
Expand Down
56 changes: 47 additions & 9 deletions test/system/220-healthcheck.bats
Original file line number Diff line number Diff line change
Expand Up @@ -139,20 +139,23 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
run_podman 1 healthcheck run $ctr
is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"

run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
if [[ $policy == "restart" ]];then
# Container has been restarted and health check works again
is "$output" "running $policy" "container has been restarted"
run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
is "$output" "0" "Failing streak of restarted container should be 0 again"
run_podman healthcheck run $ctr
if [[ $policy == "restart" ]];then
# Make sure the container transitions back to running
run_podman wait --condition=running $ctr
run_podman inspect $ctr --format "{{.RestartCount}}"
assert "${#lines[@]}" != 0 "Container has been restarted at least once"
run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
is "$output" "0" "Failing streak of restarted container should be 0 again"
run_podman healthcheck run $ctr
elif [[ $policy == "none" ]];then
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
# Container is still running and health check still broken
is "$output" "running $policy" "container continued running"
run_podman 1 healthcheck run $ctr
is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
else
# kill and stop yield the container into a non-running state
else
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
# kill and stop yield the container into a non-running state
is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
assert "$output" != "running $policy"
# also make sure that it's not stuck in the stopping state
Expand All @@ -164,4 +167,39 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
done
}

@test "podman healthcheck --health-on-failure with interval" {
ctr="healthcheck_c"

for policy in stop kill restart ;do
t0=$(date --iso-8601=seconds)
run_podman run -d --name $ctr \
--health-cmd /bin/false \
--health-retries=1 \
--health-on-failure=$policy \
--health-interval=1s \
$IMAGE top

if [[ $policy == "restart" ]];then
# Sleeping for 2 seconds makes the test much faster than using
# podman-wait which would compete with the container getting
# restarted.
sleep 2
# Make sure the container transitions back to running
run_podman wait --condition=running $ctr
run_podman inspect $ctr --format "{{.RestartCount}}"
assert "${#lines[@]}" != 0 "Container has been restarted at least once"
else
# kill and stop yield the container into a non-running state
run_podman wait $ctr
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
assert "$output" != "running $policy"
# also make sure that it's not stuck in the stopping state
assert "$output" != "stopping $policy"
fi

run_podman rm -f -t0 $ctr
done
}

# vim: filetype=sh