Skip to content

Commit

Permalink
Add a random suffix to healthcheck unit names
Browse files Browse the repository at this point in the history
Systemd dislikes it when we rapidly create and remove a transient
unit. Solution: If we change the name every time, it's different
enough that systemd is satisfied and we stop having errors trying
to restart the healthcheck.

Generate a random 32-bit integer, and add it (formatted as hex)
to the end of the unit name to do this. As a result, we now have
to store the unit name in the database, but it does make
backwards compat easy - if the unit name in the DB is empty, we
revert to the old behavior because the timer was created by old
Podman.

Should resolve RHEL-26105

Signed-off-by: Matt Heon <[email protected]>
  • Loading branch information
mheon committed May 3, 2024
1 parent 6ec2c0b commit 4fd8419
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 6 deletions.
3 changes: 3 additions & 0 deletions libpod/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,9 @@ type ContainerState struct {
// healthcheck. The container will be restarted if this exceeds a set
// number in the startup HC config.
StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"`
// HCUnitName records the name of the healthcheck unit.
// Automatically generated when the healthcheck is started.
HCUnitName string `json:"hcUnitName,omitempty"`

// ExtensionStageHooks holds hooks which will be executed by libpod
// and not delegated to the OCI runtime.
Expand Down
1 change: 1 addition & 0 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,7 @@ func resetContainerState(state *ContainerState) {
state.StartupHCPassed = false
state.StartupHCSuccessCount = 0
state.StartupHCFailureCount = 0
state.HCUnitName = ""
state.NetNS = ""
state.NetworkStatus = nil
}
Expand Down
40 changes: 34 additions & 6 deletions libpod/healthcheck_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package libpod
import (
"context"
"fmt"
"math/rand"
"os"
"os/exec"
"strings"
Expand All @@ -21,6 +22,9 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
if c.disableHealthCheckSystemd(isStartup) {
return nil
}

hcUnitName := c.hcUnitName(isStartup, false)

podman, err := os.Executable()
if err != nil {
return fmt.Errorf("failed to get path for podman for a health check timer: %w", err)
Expand All @@ -35,7 +39,7 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
cmd = append(cmd, "--setenv=PATH="+path)
}

cmd = append(cmd, "--unit", c.hcUnitName(isStartup), fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman)
cmd = append(cmd, "--unit", hcUnitName, fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman)

if logrus.IsLevelEnabled(logrus.DebugLevel) {
cmd = append(cmd, "--log-level=debug", "--syslog")
Expand All @@ -53,6 +57,12 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
if output, err := systemdRun.CombinedOutput(); err != nil {
return fmt.Errorf("%s", output)
}

c.state.HCUnitName = hcUnitName
if err := c.save(); err != nil {
return fmt.Errorf("saving container %s healthcheck unit name: %w", c.ID(), err)
}

return nil
}

Expand All @@ -72,13 +82,19 @@ func (c *Container) startTimer(isStartup bool) error {
if c.disableHealthCheckSystemd(isStartup) {
return nil
}

hcUnitName := c.state.HCUnitName
if hcUnitName == "" {
hcUnitName = c.hcUnitName(isStartup, true)
}

conn, err := systemd.ConnectToDBUS()
if err != nil {
return fmt.Errorf("unable to get systemd connection to start healthchecks: %w", err)
}
defer conn.Close()

startFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup))
startFile := fmt.Sprintf("%s.service", hcUnitName)
startChan := make(chan string)
if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
return err
Expand Down Expand Up @@ -106,10 +122,14 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er
// clean up as much as possible.
stopErrors := []error{}

unitName := c.state.HCUnitName
if unitName == "" {
unitName = c.hcUnitName(isStartup, true)
}
// Stop the timer before the service to make sure the timer does not
// fire after the service is stopped.
timerChan := make(chan string)
timerFile := fmt.Sprintf("%s.timer", c.hcUnitName(isStartup))
timerFile := fmt.Sprintf("%s.timer", unitName)
if _, err := conn.StopUnitContext(ctx, timerFile, "ignore-dependencies", timerChan); err != nil {
if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
Expand All @@ -121,7 +141,7 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er
// Reset the service before stopping it to make sure it's being removed
// on stop.
serviceChan := make(chan string)
serviceFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup))
serviceFile := fmt.Sprintf("%s.service", unitName)
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
logrus.Debugf("Failed to reset unit file: %q", err)
}
Expand Down Expand Up @@ -151,11 +171,19 @@ func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
return false
}

// Systemd unit name for the healthcheck systemd unit
func (c *Container) hcUnitName(isStartup bool) string {
// hcUnitName builds the name of the systemd unit used for this container's
// healthcheck. The name starts with the container ID, followed by "-startup"
// when isStartup is set.
// When bare is true the name is returned as-is, without a random suffix. That
// was the default behavior previously and is kept for backwards compatibility.
func (c *Container) hcUnitName(isStartup, bare bool) string {
	name := c.ID()
	if isStartup {
		name += "-startup"
	}
	if bare {
		return name
	}
	// Append a random hex suffix so that unit names are unique from run
	// to run.
	// Ref: RH Jira RHEL-26105
	return fmt.Sprintf("%s-%x", name, rand.Int())
}

0 comments on commit 4fd8419

Please sign in to comment.