From c34b5be9902a7cbdd0445ad3f7f29023b4f3e6d2 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 7 Oct 2022 17:23:00 +0200 Subject: [PATCH] Avoid unnecessary timeout of 250msec when waiting on container shutdown When you run "podman run foo" we attach to the container, which essentially blocks until the container process exits. When that happens podman immediately calls Container.WaitForExit(), but at this point the exit value has not yet been written to the db by conmon. This means that we almost always hit the "check for exit state; sleep 250msec" loop in WaitForExit(), delaying the exit of podman run by 250 msec. More recent kernels (>= 5.3) support the pidfd_open() syscall, which lets you open an fd representing a pid and then poll on it to wait until the process exits. We can use this to have the first sleep be exactly as long as is needed for conmon to exit (if we know its pid). If for whatever reason there are still issues we use the old sleep loop on later iterations. This makes "time podman run fedora true" about 200msec faster. [NO NEW TESTS NEEDED] Signed-off-by: Alexander Larsson --- libpod/container_api.go | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/libpod/container_api.go b/libpod/container_api.go index be0ca01282..61a7d66efb 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -17,6 +17,7 @@ import ( "github.com/containers/storage/pkg/archive" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) // Init creates a container in the OCI runtime, moving a container from @@ -515,6 +516,22 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) var conmonTimer time.Timer conmonTimerSet := false + conmonPidFd := -1 + conmonPidFdTriggered := false + + if c.state.ConmonPID != 0 { + // Track lifetime of conmon precisely using pidfd_open + poll. 
+ // There are many cases for this to fail, for instance conmon is dead + // or pidfd_open is not supported (pre linux 5.3), so fall back to the + // traditional loop with poll + sleep + if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil { + conmonPidFd = fd + defer unix.Close(conmonPidFd) + } else if err != unix.ENOSYS && err != unix.ESRCH { + logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err) + } + } + getExitCode := func() (bool, int32, error) { containerRemoved := false if !c.batched { @@ -582,7 +599,18 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) case <-ctx.Done(): return -1, fmt.Errorf("waiting for exit code of container %s canceled", id) default: - time.Sleep(pollInterval) + if conmonPidFd != -1 && !conmonPidFdTriggered { + // If possible (pidfd works), the first cycle we block until conmon dies + // If this happens, we fall back to the old poll delay + // There is a deadlock in the cleanup code for "play kube" which causes + // conmon to not exit, so unfortunately we have to use the poll interval + // timeout here to avoid hanging. + fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}} + _, _ = unix.Poll(fds, int(pollInterval.Milliseconds())) + conmonPidFdTriggered = true + } else { + time.Sleep(pollInterval) + } } } }