Skip to content

Commit

Permalink
libpod: do not leak systemd hc startup unit timer
Browse files Browse the repository at this point in the history
This fixes a regression added in commit 4fd8419, because the name was
overwritten by the createTimer() timer call the removeTransientFiles()
call removed the new timer and not the startup healthcheck. And then
when the container was stopped we leaked it as the wrong unit name was
in the state.

A new test has been added to ensure the logic works and we never leak
the system timers.

Fixes containers#22884

Signed-off-by: Paul Holzinger <[email protected]>
  • Loading branch information
Luap99 authored and mheon committed Jun 4, 2024
1 parent d79b94c commit aa031ad
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 8 deletions.
12 changes: 9 additions & 3 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,9 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err
}

if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
if err := c.removeTransientFiles(ctx,
c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed,
c.state.HCUnitName); err != nil {
return false, err
}
}
Expand Down Expand Up @@ -1579,7 +1581,9 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
}

if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
if err := c.removeTransientFiles(context.Background(),
c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed,
c.state.HCUnitName); err != nil {
logrus.Error(err.Error())
}
}
Expand Down Expand Up @@ -2033,7 +2037,9 @@ func (c *Container) cleanup(ctx context.Context) error {

// Remove healthcheck unit/timer file if it execs
if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
if err := c.removeTransientFiles(ctx,
c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed,
c.state.HCUnitName); err != nil {
logrus.Errorf("Removing timer for container %s healthcheck: %v", c.ID(), err)
}
}
Expand Down
3 changes: 2 additions & 1 deletion libpod/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
if recreateTimer {
logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())

oldUnit := c.state.HCUnitName
// Create the new, standard healthcheck timer first.
if err := c.createTimer(c.HealthCheckConfig().Interval.String(), false); err != nil {
logrus.Errorf("Error recreating container %s healthcheck: %v", c.ID(), err)
Expand All @@ -291,7 +292,7 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
// Which happens to be us.
// So this has to be last - after this, systemd serves us a
// SIGTERM and we exit.
if err := c.removeTransientFiles(ctx, true); err != nil {
if err := c.removeTransientFiles(ctx, true, oldUnit); err != nil {
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
return
}
Expand Down
3 changes: 1 addition & 2 deletions libpod/healthcheck_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func (c *Container) startTimer(isStartup bool) error {

// removeTransientFiles removes the systemd timer and unit files
// for the container
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error {
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
if c.disableHealthCheckSystemd(isStartup) {
return nil
}
Expand All @@ -122,7 +122,6 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er
// clean up as much as possible.
stopErrors := []error{}

unitName := c.state.HCUnitName
if unitName == "" {
unitName = c.hcUnitName(isStartup, true)
}
Expand Down
2 changes: 1 addition & 1 deletion libpod/healthcheck_nosystemd_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ func (c *Container) startTimer(isStartup bool) error {

// removeTransientFiles removes the systemd timer and unit files
// for the container
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error {
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
return nil
}
2 changes: 1 addition & 1 deletion libpod/healthcheck_unsupported.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ func (c *Container) startTimer(isStartup bool) error {

// removeTransientFiles removes the systemd timer and unit files
// for the container
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error {
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
return nil
}
12 changes: 12 additions & 0 deletions test/system/220-healthcheck.bats
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#

load helpers
load helpers.systemd


# Helper function: run 'podman inspect' and check various given fields
Expand Down Expand Up @@ -38,7 +39,10 @@ function _check_health {
--health-interval 1s \
--health-retries 3 \
--health-on-failure=kill \
--health-startup-cmd /home/podman/healthcheck \
--health-startup-interval 1s \
$IMAGE /home/podman/pause
cid="$output"

run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}"
is "$output" "kill" "on-failure action is set to kill"
Expand Down Expand Up @@ -70,6 +74,10 @@ Log[-1].ExitCode | 1
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
" "$current_time" "healthy"

# Check that we now we do have valid podman units with this
# name so that the leak check below does not turn into a NOP without noticing.
assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists"

current_time=$(date --iso-8601=seconds)
# After three successive failures, container should no longer be healthy
sleep 5
Expand All @@ -85,6 +93,10 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"

# Clean up
run_podman rm -t 0 -f healthcheck_c

# Important check for https://github.com/containers/podman/issues/22884
# We never should leak the unit files, healthcheck uses the cid in name so just grep that.
assert "$(systemctl list-units --type timer | grep $cid)" == "" "Healthcheck systemd unit cleanup"
}

@test "podman healthcheck - restart cleans up old state" {
Expand Down

0 comments on commit aa031ad

Please sign in to comment.