From c34b5be9902a7cbdd0445ad3f7f29023b4f3e6d2 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 7 Oct 2022 17:23:00 +0200 Subject: [PATCH] Avoid unnecessary timeout of 250msec when waiting on container shutdown When you run "podman run foo" we attach to the container, which essentially blocks until the container process exits. When that happens podman immediately calls Container.WaitForExit(), but at this point the exit value has not yet been written to the db by conmon. This means that we almost always hit the "check for exit state; sleep 250msec" loop in WaitForExit(), delaying the exit of podman run by 250 msec. More recent kernels (>= 5.3) support the pidfd_open() syscall, which lets you open an fd representing a pid and then poll on it to wait until the process exits. We can use this to have the first sleep be exactly as long as is needed for conmon to exit (if we know its pid). If for whatever reason there are still issues we use the old sleep loop on later iterations. This makes "time podman run fedora true" about 200msec faster. [NO NEW TESTS NEEDED] Signed-off-by: Alexander Larsson --- libpod/container_api.go | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/libpod/container_api.go b/libpod/container_api.go index be0ca01282..61a7d66efb 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -17,6 +17,7 @@ import ( "github.com/containers/storage/pkg/archive" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) // Init creates a container in the OCI runtime, moving a container from @@ -515,6 +516,22 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) var conmonTimer time.Timer conmonTimerSet := false + conmonPidFd := -1 + conmonPidFdTriggered := false + + if c.state.ConmonPID != 0 { + // Track lifetime of conmon precisely using pidfd_open + poll. 
+ // There are many cases for this to fail, for instance conmon is dead + // or pidfd_open is not supported (pre linux 5.3), so fall back to the + // traditional loop with poll + sleep + if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil { + conmonPidFd = fd + defer unix.Close(conmonPidFd) + } else if err != unix.ENOSYS && err != unix.ESRCH { + logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err) + } + } + getExitCode := func() (bool, int32, error) { containerRemoved := false if !c.batched { @@ -582,7 +599,18 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) case <-ctx.Done(): return -1, fmt.Errorf("waiting for exit code of container %s canceled", id) default: - time.Sleep(pollInterval) + if conmonPidFd != -1 && !conmonPidFdTriggered { + // If possible (pidfd works), the first cycle we block until conmon dies + // If this happens, we fall back to the old poll delay + // There is a deadlock in the cleanup code for "play kube" which causes + // conmon to not exit, so unfortunately we have to use the poll interval + // timeout here to avoid hanging. + fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}} + _, _ = unix.Poll(fds, int(pollInterval.Milliseconds())) + conmonPidFdTriggered = true + } else { + time.Sleep(pollInterval) + } } } }