From 5e53178f5b89769a3afe1c864d7a1a0c90d0ae66 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Thu, 27 Oct 2022 13:49:10 +0200 Subject: [PATCH] kill: resync the container if runtime fails If the runtime fails to kill the container there is fair chance that the container has transitionted to another state or been removed already. Take the lock and resync the container to check for that to prevent reading old and potentially outdated state. [NO NEW TESTS NEEDED] Signed-off-by: Valentin Rothberg --- libpod/container_internal.go | 10 +++++++++- libpod/oci_conmon_common.go | 25 +++++++++---------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 13feb2ffeb..75448d1759 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -1307,7 +1307,15 @@ func (c *Container) stop(timeout uint) error { // We have to check stopErr *after* we lock again - otherwise, we have a // change of panicking on a double-unlock. Ref: GH Issue 9615 - if stopErr != nil { + if errors.Is(stopErr, errSendingSignal) { + // Maybe sending the signal has failed because the container + // exited after unlocking above? So let's check the state. + if !c.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { + // Return the error unless the container has been + // stopped or exited. + return stopErr + } + } else if stopErr != nil { return stopErr } diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index f5acf079d9..3c32d84974 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -352,6 +352,8 @@ func generateResourceFile(res *spec.LinuxResources) (string, []string, error) { return f.Name(), flags, nil } +var errSendingSignal = errors.New("sending signal to container") + // KillContainer sends the given signal to the given container. // If all is set, send to all PIDs in the container. // All is only supported if the container created cgroups. @@ -370,15 +372,7 @@ func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal)) } if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { - // Update container state - there's a chance we failed because - // the container exited in the meantime. - if err2 := r.UpdateContainerStatus(ctr); err2 != nil { - logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) - } - if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { - return fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State) - } - return fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err) + return fmt.Errorf("%w %s: %v", errSendingSignal, ctr.ID(), err) } return nil @@ -407,15 +401,14 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) if timeout > 0 { if err := r.KillContainer(ctr, stopSignal, all); err != nil { - // Is the container gone? - // If so, it probably died between the first check and - // our sending the signal - // The container is stopped, so exit cleanly - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { + if !errors.Is(err, errSendingSignal) { + return err + } + // Maybe sending the signal has failed because the + // container is already gone. + if goneErr := unix.Kill(ctr.state.PID, 0); goneErr == unix.ESRCH { return nil } - return err }