Skip to content

Commit

Permalink
healthcheck: wait for systemd operations
Browse files Browse the repository at this point in the history
Make sure to wait for the systemd operations to finish when
starting/stopping healtcheck timers and services.  Also make
sure to stop the timer before the service to avoid a race
with the timer.

[NO NEW TESTS NEEDED] since it is a non-functional change and existing
tests are expected to pass.

Signed-off-by: Valentin Rothberg <[email protected]>
  • Loading branch information
vrothberg committed May 27, 2022
1 parent 800a367 commit f23ae4d
Showing 1 changed file with 49 additions and 21 deletions.
70 changes: 49 additions & 21 deletions libpod/healthcheck_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"os/exec"
"strings"

"github.com/containers/podman/v4/pkg/errorhandling"
"github.com/containers/podman/v4/pkg/rootless"
"github.com/containers/podman/v4/pkg/systemd"
"github.com/pkg/errors"
Expand Down Expand Up @@ -46,6 +47,17 @@ func (c *Container) createTimer() error {
return nil
}

// Wait for a message on the channel. Throw an error if the message is not "done".
func systemdOpSuccessful(c chan string) error {
msg := <-c
switch msg {
case "done":
return nil
default:
return fmt.Errorf("expected %q but received %q", "done", msg)
}
}

// startTimer starts a systemd timer for the healthchecks
func (c *Container) startTimer() error {
if c.disableHealthCheckSystemd() {
Expand All @@ -56,8 +68,17 @@ func (c *Container) startTimer() error {
return errors.Wrapf(err, "unable to get systemd connection to start healthchecks")
}
defer conn.Close()
_, err = conn.StartUnitContext(context.Background(), fmt.Sprintf("%s.service", c.ID()), "fail", nil)
return err

startFile := fmt.Sprintf("%s.service", c.ID())
startChan := make(chan string)
if _, err := conn.StartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
return err
}
if err := systemdOpSuccessful(startChan); err != nil {
return fmt.Errorf("starting systemd health-check timer %q: %w", startFile, err)
}

return nil
}

// removeTransientFiles removes the systemd timer and unit files
Expand All @@ -71,30 +92,37 @@ func (c *Container) removeTransientFiles(ctx context.Context) error {
return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks")
}
defer conn.Close()

// Errors are returned at the very end. Let's make sure to stop and
// clean up as much as possible.
stopErrors := []error{}

// Stop the timer before the service to make sure the timer does not
// fire after the service is stopped.
timerChan := make(chan string)
timerFile := fmt.Sprintf("%s.timer", c.ID())
serviceFile := fmt.Sprintf("%s.service", c.ID())
if _, err := conn.StopUnitContext(ctx, timerFile, "fail", timerChan); err != nil {
if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
}
} else if err := systemdOpSuccessful(timerChan); err != nil {
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err))
}

// If the service has failed (the healthcheck has failed), then
// the .service file is not removed on stopping the unit file. If
// we check the properties of the service, it will automatically
// reset the state. But checking the state takes msecs vs usecs to
// blindly call reset.
// Reset the service before stopping it to make sure it's being removed
// on stop.
serviceChan := make(chan string)
serviceFile := fmt.Sprintf("%s.service", c.ID())
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
logrus.Debugf("failed to reset unit file: %q", err)
logrus.Debugf("Failed to reset unit file: %q", err)
}

// We want to ignore errors where the timer unit and/or service unit has already
// been removed. The error return is generic so we have to check against the
// string in the error
if _, err = conn.StopUnitContext(ctx, serviceFile, "fail", nil); err != nil {
if _, err := conn.StopUnitContext(ctx, serviceFile, "fail", serviceChan); err != nil {
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
return errors.Wrapf(err, "unable to remove service file")
}
}
if _, err = conn.StopUnitContext(ctx, timerFile, "fail", nil); err != nil {
if strings.HasSuffix(err.Error(), ".timer not loaded.") {
return nil
stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err))
}
} else if err := systemdOpSuccessful(serviceChan); err != nil {
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err))
}
return err

return errorhandling.JoinErrors(stopErrors)
}

0 comments on commit f23ae4d

Please sign in to comment.