Skip to content

Commit

Permalink
Ensure Conmon is alive before waiting for exit file
Browse files Browse the repository at this point in the history
This came out of a conversation with Valentin about
systemd-managed Podman. He discovered that unit files did not
properly handle cases where Conmon was dead - the ExecStopPost
`podman rm --force` line was not actually removing the container,
but interestingly, adding a `podman cleanup --rm` line would
remove it. Both of these commands do the same thing (minus the
`podman cleanup --rm` command not force-removing running
containers).

Without a running Conmon instance, the container process is still
running (assuming you killed Conmon with SIGKILL and it had no
chance to kill the container it managed), but you can still kill
the container itself with `podman stop` - Conmon is not involved,
only the OCI Runtime. (`podman rm --force` and `podman stop` use
the same code to kill the container). The problem comes when we
want to get the container's exit code - we expect Conmon to make
us an exit file, which it's obviously not going to do, being
dead. The first `podman rm` would fail because of this, but
importantly, it would (after failing to retrieve the exit code
correctly) set container status to Exited, so that the second
`podman cleanup` process would succeed.

To make sure the first `podman rm --force` succeeds, we need to
catch the case where Conmon is already dead, and instead of
waiting for an exit file that will never come, immediately set
the Stopped state and remove an error that can be caught and
handled.

Signed-off-by: Matthew Heon <[email protected]>
  • Loading branch information
mheon committed Jun 8, 2020
1 parent b8acc85 commit 9d964ff
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 4 deletions.
24 changes: 22 additions & 2 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -1209,13 +1209,35 @@ func (c *Container) stop(timeout uint) error {
}
}

// Check if conmon is still alive.
// If it is not, we won't be getting an exit file.
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
if err != nil {
return err
}

if err := c.ociRuntime.StopContainer(c, timeout, all); err != nil {
return err
}

c.newContainerEvent(events.Stop)

c.state.PID = 0
c.state.ConmonPID = 0
c.state.StoppedByUser = true

if !conmonAlive {
// Conmon is dead, so we can't epect an exit code.
c.state.ExitCode = -1
c.state.FinishedTime = time.Now()
c.state.State = define.ContainerStateStopped
if err := c.save(); err != nil {
logrus.Errorf("Error saving container %s status: %v", c.ID(), err)
}

return errors.Wrapf(define.ErrConmonDead, "container %s conmon process missing, cannot retrieve exit code", c.ID())
}

if err := c.save(); err != nil {
return errors.Wrapf(err, "error saving container %s state after stopping", c.ID())
}
Expand All @@ -1225,8 +1247,6 @@ func (c *Container) stop(timeout uint) error {
return err
}

c.newContainerEvent(events.Stop)

return nil
}

Expand Down
3 changes: 3 additions & 0 deletions libpod/define/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ var (
// ErrConmonOutdated indicates the version of conmon found (whether via the configuration or $PATH)
// is out of date for the current podman version
ErrConmonOutdated = errors.New("outdated conmon version")
// ErrConmonDead indicates that the container's conmon process has been
// killed, preventing normal operation.
ErrConmonDead = errors.New("conmon process killed")

// ErrImageInUse indicates the requested operation failed because the image was in use
ErrImageInUse = errors.New("image is being used")
Expand Down
7 changes: 7 additions & 0 deletions libpod/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,13 @@ type OCIRuntime interface {
// error.
CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error

// CheckConmonRunning verifies that the given container's Conmon
// instance is still running. Runtimes without Conmon, or systems where
// the PID of conmon is not available, should mock this as True.
// True indicates that Conmon for the instance is running, False
// indicates it is not.
CheckConmonRunning(ctr *Container) (bool, error)

// SupportsCheckpoint returns whether this OCI runtime
// implementation supports the CheckpointContainer() operation.
SupportsCheckpoint() bool
Expand Down
25 changes: 25 additions & 0 deletions libpod/oci_conmon_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,31 @@ func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options Container
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
}

func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) {
if ctr.state.ConmonPID == 0 {
// If the container is running or paused, assume Conmon is
// running. We didn't record Conmon PID on some old versions, so
// that is likely what's going on...
// Unusual enough that we should print a warning message though.
if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
logrus.Warnf("Conmon PID is not set, but container is running!")
return true, nil
}
// Container's not running, so conmon PID being unset is
// expected. Conmon is not running.
return false, nil
}

// We have a conmon PID. Ping it with signal 0.
if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil {
if err == unix.ESRCH {
return false, nil
}
return false, errors.Wrapf(err, "error pinging container %s conmon with signal 0", ctr.ID())
}
return true, nil
}

// SupportsCheckpoint checks if the OCI runtime supports checkpointing
// containers.
func (r *ConmonOCIRuntime) SupportsCheckpoint() bool {
Expand Down
5 changes: 5 additions & 0 deletions libpod/oci_missing.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ func (r *MissingRuntime) CheckpointContainer(ctr *Container, options ContainerCh
return r.printError()
}

// CheckConmonRunning is not available as the runtime is missing
func (r *MissingRuntime) CheckConmonRunning(ctr *Container) (bool, error) {
return false, r.printError()
}

// SupportsCheckpoint returns false as checkpointing requires a working runtime
func (r *MissingRuntime) SupportsCheckpoint() bool {
return false
Expand Down
6 changes: 4 additions & 2 deletions libpod/runtime_ctr.go
Original file line number Diff line number Diff line change
Expand Up @@ -464,9 +464,11 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool,
}
}

// Check that the container's in a good state to be removed
// Check that the container's in a good state to be removed.
if c.state.State == define.ContainerStateRunning {
if err := c.stop(c.StopTimeout()); err != nil {
// Ignore ErrConmonDead - we couldn't retrieve the container's
// exit code properly, but it's still stopped.
if err := c.stop(c.StopTimeout()); err != nil && errors.Cause(err) != define.ErrConmonDead {
return errors.Wrapf(err, "cannot remove container %s as it could not be stopped", c.ID())
}
}
Expand Down

0 comments on commit 9d964ff

Please sign in to comment.