Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

libpod: do not leak systemd hc startup unit timer #22895

Merged
merged 1 commit into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,9 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err
}

if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
if err := c.removeTransientFiles(ctx,
c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed,
c.state.HCUnitName); err != nil {
return false, err
}
}
Expand Down Expand Up @@ -1590,7 +1592,9 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
}

if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
if err := c.removeTransientFiles(context.Background(),
c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed,
c.state.HCUnitName); err != nil {
logrus.Error(err.Error())
}
}
Expand Down Expand Up @@ -2047,7 +2051,9 @@ func (c *Container) cleanup(ctx context.Context) error {

// Remove healthcheck unit/timer file if it execs
if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
if err := c.removeTransientFiles(ctx,
c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed,
c.state.HCUnitName); err != nil {
logrus.Errorf("Removing timer for container %s healthcheck: %v", c.ID(), err)
}
}
Expand Down
3 changes: 2 additions & 1 deletion libpod/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
if recreateTimer {
logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())

oldUnit := c.state.HCUnitName
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need a temporary variable for this value?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is the whole point of the bug fix, as written in the commit message createTimer() will overwrite this with a new value thus removeTransientFiles() removed the new unit not the old one.

Maybe I should have added this in a comment there to make it clear

// Create the new, standard healthcheck timer first.
if err := c.createTimer(c.HealthCheckConfig().Interval.String(), false); err != nil {
logrus.Errorf("Error recreating container %s healthcheck: %v", c.ID(), err)
Expand All @@ -291,7 +292,7 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
// Which happens to be us.
// So this has to be last - after this, systemd serves us a
// SIGTERM and we exit.
if err := c.removeTransientFiles(ctx, true); err != nil {
if err := c.removeTransientFiles(ctx, true, oldUnit); err != nil {
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
return
}
Expand Down
3 changes: 1 addition & 2 deletions libpod/healthcheck_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func (c *Container) startTimer(isStartup bool) error {

// removeTransientFiles removes the systemd timer and unit files
// for the container
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error {
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
if c.disableHealthCheckSystemd(isStartup) {
return nil
}
Expand All @@ -122,7 +122,6 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er
// clean up as much as possible.
stopErrors := []error{}

unitName := c.state.HCUnitName
if unitName == "" {
unitName = c.hcUnitName(isStartup, true)
}
Expand Down
2 changes: 1 addition & 1 deletion libpod/healthcheck_nosystemd_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ func (c *Container) startTimer(isStartup bool) error {

// removeTransientFiles removes the systemd timer and unit files
// for the container
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error {
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
return nil
}
2 changes: 1 addition & 1 deletion libpod/healthcheck_unsupported.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ func (c *Container) startTimer(isStartup bool) error {

// removeTransientFiles removes the systemd timer and unit files
// for the container
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error {
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
return nil
}
12 changes: 12 additions & 0 deletions test/system/220-healthcheck.bats
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#

load helpers
load helpers.systemd


# Helper function: run 'podman inspect' and check various given fields
Expand Down Expand Up @@ -38,7 +39,10 @@ function _check_health {
--health-interval 1s \
--health-retries 3 \
--health-on-failure=kill \
--health-startup-cmd /home/podman/healthcheck \
--health-startup-interval 1s \
$IMAGE /home/podman/pause
cid="$output"

run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}"
is "$output" "kill" "on-failure action is set to kill"
Expand Down Expand Up @@ -70,6 +74,10 @@ Log[-1].ExitCode | 1
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
" "$current_time" "healthy"

# Check that we now we do have valid podman units with this
# name so that the leak check below does not turn into a NOP without noticing.
assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists"

current_time=$(date --iso-8601=seconds)
# After three successive failures, container should no longer be healthy
sleep 5
Expand All @@ -85,6 +93,10 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"

# Clean up
run_podman rm -t 0 -f healthcheck_c

# Important check for https://github.com/containers/podman/issues/22884
# We never should leak the unit files, healthcheck uses the cid in name so just grep that.
assert "$(systemctl list-units --type timer | grep $cid)" == "" "Healthcheck systemd unit cleanup"
}

@test "podman healthcheck - restart cleans up old state" {
Expand Down