Skip to content

Commit

Permalink
kill: resync the container if runtime fails
Browse files Browse the repository at this point in the history
If the runtime fails to kill the container there is fair chance that the
container has transitionted to another state or been removed already.
Take the lock and resync the container to check for that to prevent
reading old and potentially outdated state.

[NO NEW TESTS NEEDED]

Signed-off-by: Valentin Rothberg <[email protected]>
  • Loading branch information
vrothberg committed Oct 27, 2022
1 parent 47bcd10 commit 5e53178
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 17 deletions.
10 changes: 9 additions & 1 deletion libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -1307,7 +1307,15 @@ func (c *Container) stop(timeout uint) error {

// We have to check stopErr *after* we lock again - otherwise, we have a
// change of panicking on a double-unlock. Ref: GH Issue 9615
if stopErr != nil {
if errors.Is(stopErr, errSendingSignal) {
// Maybe sending the signal has failed because the container
// exited after unlocking above? So let's check the state.
if !c.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
// Return the error unless the container has been
// stopped or exited.
return stopErr
}
} else if stopErr != nil {
return stopErr
}

Expand Down
25 changes: 9 additions & 16 deletions libpod/oci_conmon_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,8 @@ func generateResourceFile(res *spec.LinuxResources) (string, []string, error) {
return f.Name(), flags, nil
}

var errSendingSignal = errors.New("sending signal to container")

// KillContainer sends the given signal to the given container.
// If all is set, send to all PIDs in the container.
// All is only supported if the container created cgroups.
Expand All @@ -370,15 +372,7 @@ func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool)
args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal))
}
if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil {
// Update container state - there's a chance we failed because
// the container exited in the meantime.
if err2 := r.UpdateContainerStatus(ctr); err2 != nil {
logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2)
}
if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
return fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State)
}
return fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err)
return fmt.Errorf("%w %s: %v", errSendingSignal, ctr.ID(), err)
}

return nil
Expand Down Expand Up @@ -407,15 +401,14 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool)

if timeout > 0 {
if err := r.KillContainer(ctr, stopSignal, all); err != nil {
// Is the container gone?
// If so, it probably died between the first check and
// our sending the signal
// The container is stopped, so exit cleanly
err := unix.Kill(ctr.state.PID, 0)
if err == unix.ESRCH {
if !errors.Is(err, errSendingSignal) {
return err
}
// Maybe sending the signal has failed because the
// container is already gone.
if goneErr := unix.Kill(ctr.state.PID, 0); goneErr == unix.ESRCH {
return nil
}

return err
}

Expand Down

0 comments on commit 5e53178

Please sign in to comment.