From f23ae4d660d1099cacf71cc09c9b053abbcdf8ca Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Fri, 27 May 2022 13:04:54 +0200 Subject: [PATCH] healthcheck: wait for systemd operations Make sure to wait for the systemd operations to finish when starting/stopping healtcheck timers and services. Also make sure to stop the timer before the service to avoid a race with the timer. [NO NEW TESTS NEEDED] since it is a non-functional change and existing tests are expected to pass. Signed-off-by: Valentin Rothberg --- libpod/healthcheck_linux.go | 70 ++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/libpod/healthcheck_linux.go b/libpod/healthcheck_linux.go index 45b3a0e41d..1e03db5420 100644 --- a/libpod/healthcheck_linux.go +++ b/libpod/healthcheck_linux.go @@ -7,6 +7,7 @@ import ( "os/exec" "strings" + "github.com/containers/podman/v4/pkg/errorhandling" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/systemd" "github.com/pkg/errors" @@ -46,6 +47,17 @@ func (c *Container) createTimer() error { return nil } +// Wait for a message on the channel. Throw an error if the message is not "done". +func systemdOpSuccessful(c chan string) error { + msg := <-c + switch msg { + case "done": + return nil + default: + return fmt.Errorf("expected %q but received %q", "done", msg) + } +} + // startTimer starts a systemd timer for the healthchecks func (c *Container) startTimer() error { if c.disableHealthCheckSystemd() { @@ -56,8 +68,17 @@ func (c *Container) startTimer() error { return errors.Wrapf(err, "unable to get systemd connection to start healthchecks") } defer conn.Close() - _, err = conn.StartUnitContext(context.Background(), fmt.Sprintf("%s.service", c.ID()), "fail", nil) - return err + + startFile := fmt.Sprintf("%s.service", c.ID()) + startChan := make(chan string) + if _, err := conn.StartUnitContext(context.Background(), startFile, "fail", startChan); err != nil { + return err + } + if err := systemdOpSuccessful(startChan); err != nil { + return fmt.Errorf("starting systemd health-check timer %q: %w", startFile, err) + } + + return nil } // removeTransientFiles removes the systemd timer and unit files @@ -71,30 +92,37 @@ func (c *Container) removeTransientFiles(ctx context.Context) error { return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks") } defer conn.Close() + + // Errors are returned at the very end. Let's make sure to stop and + // clean up as much as possible. + stopErrors := []error{} + + // Stop the timer before the service to make sure the timer does not + // fire after the service is stopped. + timerChan := make(chan string) timerFile := fmt.Sprintf("%s.timer", c.ID()) - serviceFile := fmt.Sprintf("%s.service", c.ID()) + if _, err := conn.StopUnitContext(ctx, timerFile, "fail", timerChan); err != nil { + if !strings.HasSuffix(err.Error(), ".timer not loaded.") { + stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err)) + } + } else if err := systemdOpSuccessful(timerChan); err != nil { + stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err)) + } - // If the service has failed (the healthcheck has failed), then - // the .service file is not removed on stopping the unit file. If - // we check the properties of the service, it will automatically - // reset the state. But checking the state takes msecs vs usecs to - // blindly call reset. + // Reset the service before stopping it to make sure it's being removed + // on stop. + serviceChan := make(chan string) + serviceFile := fmt.Sprintf("%s.service", c.ID()) if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil { - logrus.Debugf("failed to reset unit file: %q", err) + logrus.Debugf("Failed to reset unit file: %q", err) } - - // We want to ignore errors where the timer unit and/or service unit has already - // been removed. The error return is generic so we have to check against the - // string in the error - if _, err = conn.StopUnitContext(ctx, serviceFile, "fail", nil); err != nil { + if _, err := conn.StopUnitContext(ctx, serviceFile, "fail", serviceChan); err != nil { if !strings.HasSuffix(err.Error(), ".service not loaded.") { - return errors.Wrapf(err, "unable to remove service file") - } - } - if _, err = conn.StopUnitContext(ctx, timerFile, "fail", nil); err != nil { - if strings.HasSuffix(err.Error(), ".timer not loaded.") { - return nil + stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err)) } + } else if err := systemdOpSuccessful(serviceChan); err != nil { + stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err)) } - return err + + return errorhandling.JoinErrors(stopErrors) }