Skip to content

Commit

Permalink
[v4.4] fix --health-on-failure=restart in transient unit
Browse files Browse the repository at this point in the history
As described in #17777, the `restart` on-failure action did not behave
correctly when the health check is being run by a transient systemd
unit.  It ran just fine when being executed outside such a unit, for
instance, manually or, as done in the system tests, in a scripted
fashion.

There were two issue causing the `restart` on-failure action to
misbehave:

1) The transient systemd units used the default `KillMode=cgroup` which
   will nuke all processes in the specific cgroup including the recently
   restarted container/conmon once the main `podman healthcheck run`
   process exits.

2) Podman attempted to remove the transient systemd unit and timer
   during restart.  That is perfectly fine when manually restarting the
   container but not when the restart itself is being executed inside
   such a transient unit.  Ultimately, Podman tried to shoot itself in
   the foot.

Fix both issues by moving the restart logic in the cleanup process.
Instead of restarting the container, the `healthcheck run` will just
stop the container and the cleanup process will restart the container
once it has turned unhealthy.

Backport of commit 9563415.

Fixes: #17777
Signed-off-by: Valentin Rothberg <[email protected]>
  • Loading branch information
vrothberg committed Mar 21, 2023
1 parent 5106bbf commit 06925d5
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 11 deletions.
16 changes: 16 additions & 0 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,15 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
}

func (c *Container) shouldRestart() bool {
if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart {
isUnhealthy, err := c.isUnhealthy()
if err != nil {
logrus.Errorf("Checking if container is unhealthy: %v", err)
} else if isUnhealthy {
return true
}
}

// If we did not get a restart policy match, return false
// Do the same if we're not a policy that restarts.
if !c.state.RestartPolicyMatch ||
Expand Down Expand Up @@ -267,6 +276,12 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err
return false, err
}

if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
return false, err
}
}

// Is the container running again?
// If so, we don't have to do anything
if c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
Expand Down Expand Up @@ -1431,6 +1446,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
if err := c.stop(timeout); err != nil {
return err
}

if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
logrus.Error(err.Error())
Expand Down
20 changes: 18 additions & 2 deletions libpod/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,12 @@ func (c *Container) processHealthCheckStatus(status string) error {
}

case define.HealthCheckOnFailureActionRestart:
if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil {
return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err)
// We let the cleanup process handle the restart. Otherwise
// the container would be restarted in the context of a
// transient systemd unit which may cause undesired side
// effects.
if err := c.Stop(); err != nil {
return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err)
}

case define.HealthCheckOnFailureActionStop:
Expand Down Expand Up @@ -346,6 +350,18 @@ func (c *Container) updateHealthStatus(status string) error {
return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
}

// isUnhealthy returns if the current health check status in unhealthy.
func (c *Container) isUnhealthy() (bool, error) {
if !c.HasHealthCheck() {
return false, nil
}
healthCheck, err := c.getHealthCheckLog()
if err != nil {
return false, err
}
return healthCheck.Status == define.HealthCheckUnhealthy, nil
}

// UpdateHealthCheckLog parses the health check results and writes the log
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) {
c.lock.Lock()
Expand Down
56 changes: 47 additions & 9 deletions test/system/220-healthcheck.bats
Original file line number Diff line number Diff line change
Expand Up @@ -139,20 +139,23 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
run_podman 1 healthcheck run $ctr
is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"

run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
if [[ $policy == "restart" ]];then
# Container has been restarted and health check works again
is "$output" "running $policy" "container has been restarted"
run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
is "$output" "0" "Failing streak of restarted container should be 0 again"
run_podman healthcheck run $ctr
if [[ $policy == "restart" ]];then
# Make sure the container transitions back to running
run_podman wait --condition=running $ctr
run_podman inspect $ctr --format "{{.RestartCount}}"
assert "${#lines[@]}" != 0 "Container has been restarted at least once"
run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
is "$output" "0" "Failing streak of restarted container should be 0 again"
run_podman healthcheck run $ctr
elif [[ $policy == "none" ]];then
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
# Container is still running and health check still broken
is "$output" "running $policy" "container continued running"
run_podman 1 healthcheck run $ctr
is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
else
# kill and stop yield the container into a non-running state
else
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
# kill and stop yield the container into a non-running state
is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
assert "$output" != "running $policy"
# also make sure that it's not stuck in the stopping state
Expand All @@ -164,4 +167,39 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
done
}

@test "podman healthcheck --health-on-failure with interval" {
ctr="healthcheck_c"

for policy in stop kill restart ;do
t0=$(date --iso-8601=seconds)
run_podman run -d --name $ctr \
--health-cmd /bin/false \
--health-retries=1 \
--health-on-failure=$policy \
--health-interval=1s \
$IMAGE top

if [[ $policy == "restart" ]];then
# Sleeping for 2 seconds makes the test much faster than using
# podman-wait which would compete with the container getting
# restarted.
sleep 2
# Make sure the container transitions back to running
run_podman wait --condition=running $ctr
run_podman inspect $ctr --format "{{.RestartCount}}"
assert "${#lines[@]}" != 0 "Container has been restarted at least once"
else
# kill and stop yield the container into a non-running state
run_podman wait $ctr
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
assert "$output" != "running $policy"
# also make sure that it's not stuck in the stopping state
assert "$output" != "stopping $policy"
fi

run_podman rm -f -t0 $ctr
done
}

# vim: filetype=sh

0 comments on commit 06925d5

Please sign in to comment.