diff --git a/libpod/container_internal.go b/libpod/container_internal.go index dd93690f40..d81e2512ec 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -229,6 +229,15 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error { } func (c *Container) shouldRestart() bool { + if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart { + isUnhealthy, err := c.isUnhealthy() + if err != nil { + logrus.Errorf("Checking if container is unhealthy: %v", err) + } else if isUnhealthy { + return true + } + } + // If we did not get a restart policy match, return false // Do the same if we're not a policy that restarts. if !c.state.RestartPolicyMatch || @@ -268,6 +277,12 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err return false, err } + if c.config.HealthCheckConfig != nil { + if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { + return false, err + } + } + // Is the container running again? // If so, we don't have to do anything if c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) { @@ -1429,6 +1444,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr if err := c.stop(timeout); err != nil { return err } + if c.config.HealthCheckConfig != nil { if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { logrus.Error(err.Error()) diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 63f22ef9fb..987b02d934 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -184,8 +184,12 @@ func (c *Container) processHealthCheckStatus(status string) error { } case define.HealthCheckOnFailureActionRestart: - if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil { - return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err) + // We let the cleanup process handle the restart. Otherwise + // the container would be restarted in the context of a + // transient systemd unit which may cause undesired side + // effects. + if err := c.Stop(); err != nil { + return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err) } case define.HealthCheckOnFailureActionStop: @@ -346,6 +350,18 @@ func (c *Container) updateHealthStatus(status string) error { return os.WriteFile(c.healthCheckLogPath(), newResults, 0700) } +// isUnhealthy returns if the current health check status in unhealthy. +func (c *Container) isUnhealthy() (bool, error) { + if !c.HasHealthCheck() { + return false, nil + } + healthCheck, err := c.getHealthCheckLog() + if err != nil { + return false, err + } + return healthCheck.Status == define.HealthCheckUnhealthy, nil +} + // UpdateHealthCheckLog parses the health check results and writes the log func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) { c.lock.Lock() diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats index 811f07709a..828232b39d 100644 --- a/test/system/220-healthcheck.bats +++ b/test/system/220-healthcheck.bats @@ -139,19 +139,22 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\" run_podman 1 healthcheck run $ctr is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)" - run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}" if [[ $policy == "restart" ]];then - # Container has been restarted and health check works again - is "$output" "running $policy" "container has been restarted" + # Make sure the container transitions back to running + run_podman wait --condition=running $ctr + run_podman inspect $ctr --format "{{.RestartCount}}" + assert "${#lines[@]}" != 0 "Container has been restarted at least once" run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}" is "$output" "0" "Failing streak of restarted container should be 0 again" run_podman healthcheck run $ctr elif [[ $policy == "none" ]];then + run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}" # Container is still running and health check still broken is "$output" "running $policy" "container continued running" run_podman 1 healthcheck run $ctr is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)" else + run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}" # kill and stop yield the container into a non-running state is "$output" ".* $policy" "container was stopped/killed (policy: $policy)" assert "$output" != "running $policy" @@ -164,4 +167,39 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\" done } +@test "podman healthcheck --health-on-failure with interval" { + ctr="healthcheck_c" + + for policy in stop kill restart ;do + t0=$(date --iso-8601=seconds) + run_podman run -d --name $ctr \ + --health-cmd /bin/false \ + --health-retries=1 \ + --health-on-failure=$policy \ + --health-interval=1s \ + $IMAGE top + + if [[ $policy == "restart" ]];then + # Sleeping for 2 seconds makes the test much faster than using + # podman-wait which would compete with the container getting + # restarted. + sleep 2 + # Make sure the container transitions back to running + run_podman wait --condition=running $ctr + run_podman inspect $ctr --format "{{.RestartCount}}" + assert "${#lines[@]}" != 0 "Container has been restarted at least once" + else + # kill and stop yield the container into a non-running state + run_podman wait $ctr + run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}" + is "$output" ".* $policy" "container was stopped/killed (policy: $policy)" + assert "$output" != "running $policy" + # also make sure that it's not stuck in the stopping state + assert "$output" != "stopping $policy" + fi + + run_podman rm -f -t0 $ctr + done +} + # vim: filetype=sh