From 2ebc9004f40c20916e94ad2e3538571b058c40b5 Mon Sep 17 00:00:00 2001 From: Matthew Heon Date: Wed, 7 Jun 2023 14:44:21 -0400 Subject: [PATCH] Ignore spurious warnings when killing containers There are certain messages logged by OCI runtimes when killing a container that has already stopped that we really do not care about when stopping a container. Due to our architecture, there are inherent races around stopping containers, and so we cannot guarantee that *we* are the people to kill it - but that doesn't matter because Podman only cares that the container has stopped, not who delivered the fatal signal. Unfortunately, the OCI runtimes don't understand this, and log various warning messages when the `kill` command is invoked on a container that was already dead. These cause our tests to fail, as we now check for clean STDERR when running Podman. To work around this, capture STDERR for the OCI runtime in a buffer only for stopping containers, and go through and discard any of the warnings we identified as spurious. Signed-off-by: Matthew Heon --- libpod/oci_conmon_common.go | 106 ++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index d803276b15..95065bf28c 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -356,10 +356,20 @@ func generateResourceFile(res *spec.LinuxResources) (string, []string, error) { // If all is set, send to all PIDs in the container. // All is only supported if the container created cgroups. func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { + if _, err := r.killContainer(ctr, signal, all, false); err != nil { + return err + } + + return nil +} + +// If captureStderr is requested, OCI runtime STDERR will be captured as a +// *bytes.buffer and returned; otherwise, it is set to os.Stderr. +func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captureStderr bool) (*bytes.Buffer, error) { logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) runtimeDir, err := util.GetRuntimeDir() if err != nil { - return err + return nil, err } env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} var args []string @@ -369,19 +379,27 @@ func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) } else { args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal)) } - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { + var ( + stderr io.Writer = os.Stderr + stderrBuffer *bytes.Buffer + ) + if captureStderr { + stderrBuffer = new(bytes.Buffer) + stderr = stderrBuffer + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, stderr, env, r.path, args...); err != nil { // Update container state - there's a chance we failed because // the container exited in the meantime. if err2 := r.UpdateContainerStatus(ctr); err2 != nil { logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) } if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { - return fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State) + return stderrBuffer, fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State) } - return fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err) + return stderrBuffer, fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err) } - return nil + return stderrBuffer, nil } // StopContainer stops a container, first using its given stop signal (or @@ -400,23 +418,65 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) return nil } - if timeout > 0 { - stopSignal := ctr.config.StopSignal - if stopSignal == 0 { - stopSignal = uint(syscall.SIGTERM) + killCtr := func(signal uint) (bool, error) { + stderr, err := r.killContainer(ctr, signal, all, true) + + // Before handling error from KillContainer, convert STDERR to a []string + // (one string per line of output) and print it, ignoring known OCI runtime + // errors that we don't care about + stderrLines := strings.Split(stderr.String(), "\n") + for _, line := range stderrLines { + if line == "" { + continue + } + if strings.Contains(line, "container not running") || strings.Contains(line, "open pidfd: No such process") { + logrus.Debugf("Failure to kill container (already stopped?): logged %s", line) + continue + } + fmt.Fprintf(os.Stderr, "%s\n", line) } - if err := r.KillContainer(ctr, stopSignal, all); err != nil { + + if err != nil { + // There's an inherent race with the cleanup process (see + // #16142, #17142). If the container has already been marked as + // stopped or exited by the cleanup process, we can return + // immediately. + if errors.Is(err, define.ErrCtrStateInvalid) && ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { + return true, nil + } + + // If the PID is 0, then the container is already stopped. + if ctr.state.PID == 0 { + return true, nil + } + // Is the container gone? // If so, it probably died between the first check and // our sending the signal // The container is stopped, so exit cleanly err := unix.Kill(ctr.state.PID, 0) if err == unix.ESRCH { - return nil + return true, nil } + return false, err + } + return false, nil + } + + if timeout > 0 { + stopSignal := ctr.config.StopSignal + if stopSignal == 0 { + stopSignal = uint(syscall.SIGTERM) + } + + stopped, err := killCtr(stopSignal) + if err != nil { return err } + if stopped { + return nil + } if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err) @@ -427,27 +487,13 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) } } - // If the timeout was set to 0 or if stopping the container with the - // specified signal did not work, use the big hammer with SIGKILL. - if err := r.KillContainer(ctr, uint(unix.SIGKILL), all); err != nil { - // There's an inherent race with the cleanup process (see - // #16142, #17142). If the container has already been marked as - // stopped or exited by the cleanup process, we can return - // immediately. - if errors.Is(err, define.ErrCtrStateInvalid) && ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { - return nil - } - - // If the PID is 0, then the container is already stopped. - if ctr.state.PID == 0 { - return nil - } - // Again, check if the container is gone. If it is, exit cleanly. - if aliveErr := unix.Kill(ctr.state.PID, 0); errors.Is(aliveErr, unix.ESRCH) { - return nil - } + stopped, err := killCtr(uint(unix.SIGKILL)) + if err != nil { return fmt.Errorf("sending SIGKILL to container %s: %w", ctr.ID(), err) } + if stopped { + return nil + } // Give runtime a few seconds to make it happen if err := waitContainerStop(ctr, killContainerTimeout); err != nil {