Skip to content

Commit

Permalink
Merge pull request #6520 from mheon/no_conmon_no_error
Browse files Browse the repository at this point in the history
Ensure Conmon is alive before waiting for exit file
  • Loading branch information
openshift-merge-robot authored Jun 9, 2020
2 parents a85e979 + 9d964ff commit 79f30af
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 4 deletions.
24 changes: 22 additions & 2 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -1209,13 +1209,35 @@ func (c *Container) stop(timeout uint) error {
}
}

// Check if conmon is still alive.
// If it is not, we won't be getting an exit file.
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
if err != nil {
return err
}

if err := c.ociRuntime.StopContainer(c, timeout, all); err != nil {
return err
}

c.newContainerEvent(events.Stop)

c.state.PID = 0
c.state.ConmonPID = 0
c.state.StoppedByUser = true

if !conmonAlive {
// Conmon is dead, so we can't epect an exit code.
c.state.ExitCode = -1
c.state.FinishedTime = time.Now()
c.state.State = define.ContainerStateStopped
if err := c.save(); err != nil {
logrus.Errorf("Error saving container %s status: %v", c.ID(), err)
}

return errors.Wrapf(define.ErrConmonDead, "container %s conmon process missing, cannot retrieve exit code", c.ID())
}

if err := c.save(); err != nil {
return errors.Wrapf(err, "error saving container %s state after stopping", c.ID())
}
Expand All @@ -1225,8 +1247,6 @@ func (c *Container) stop(timeout uint) error {
return err
}

c.newContainerEvent(events.Stop)

return nil
}

Expand Down
3 changes: 3 additions & 0 deletions libpod/define/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ var (
// ErrConmonOutdated indicates the version of conmon found (whether via the configuration or $PATH)
// is out of date for the current podman version
ErrConmonOutdated = errors.New("outdated conmon version")
// ErrConmonDead indicates that the container's conmon process has been
// killed, preventing normal operation.
ErrConmonDead = errors.New("conmon process killed")

// ErrImageInUse indicates the requested operation failed because the image was in use
ErrImageInUse = errors.New("image is being used")
Expand Down
7 changes: 7 additions & 0 deletions libpod/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,13 @@ type OCIRuntime interface {
// error.
CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error

// CheckConmonRunning verifies that the given container's Conmon
// instance is still running. Runtimes without Conmon, or systems where
// the PID of conmon is not available, should mock this as True.
// True indicates that Conmon for the instance is running, False
// indicates it is not.
CheckConmonRunning(ctr *Container) (bool, error)

// SupportsCheckpoint returns whether this OCI runtime
// implementation supports the CheckpointContainer() operation.
SupportsCheckpoint() bool
Expand Down
25 changes: 25 additions & 0 deletions libpod/oci_conmon_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,31 @@ func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options Container
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
}

func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) {
if ctr.state.ConmonPID == 0 {
// If the container is running or paused, assume Conmon is
// running. We didn't record Conmon PID on some old versions, so
// that is likely what's going on...
// Unusual enough that we should print a warning message though.
if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
logrus.Warnf("Conmon PID is not set, but container is running!")
return true, nil
}
// Container's not running, so conmon PID being unset is
// expected. Conmon is not running.
return false, nil
}

// We have a conmon PID. Ping it with signal 0.
if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil {
if err == unix.ESRCH {
return false, nil
}
return false, errors.Wrapf(err, "error pinging container %s conmon with signal 0", ctr.ID())
}
return true, nil
}

// SupportsCheckpoint checks if the OCI runtime supports checkpointing
// containers.
func (r *ConmonOCIRuntime) SupportsCheckpoint() bool {
Expand Down
5 changes: 5 additions & 0 deletions libpod/oci_missing.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ func (r *MissingRuntime) CheckpointContainer(ctr *Container, options ContainerCh
return r.printError()
}

// CheckConmonRunning is not available as the runtime is missing
func (r *MissingRuntime) CheckConmonRunning(ctr *Container) (bool, error) {
return false, r.printError()
}

// SupportsCheckpoint returns false as checkpointing requires a working runtime
func (r *MissingRuntime) SupportsCheckpoint() bool {
return false
Expand Down
6 changes: 4 additions & 2 deletions libpod/runtime_ctr.go
Original file line number Diff line number Diff line change
Expand Up @@ -464,9 +464,11 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool,
}
}

// Check that the container's in a good state to be removed
// Check that the container's in a good state to be removed.
if c.state.State == define.ContainerStateRunning {
if err := c.stop(c.StopTimeout()); err != nil {
// Ignore ErrConmonDead - we couldn't retrieve the container's
// exit code properly, but it's still stopped.
if err := c.stop(c.StopTimeout()); err != nil && errors.Cause(err) != define.ErrConmonDead {
return errors.Wrapf(err, "cannot remove container %s as it could not be stopped", c.ID())
}
}
Expand Down

0 comments on commit 79f30af

Please sign in to comment.