libpod: simplify WaitForExit()
The current code did several complicated state checks that simply do not
work properly on a fast restarting container. It used a special case for
--restart=always but forgot to take care of --restart=on-failure, which
would always hang for 20s until it ran into the timeout.

The old logic also used to call CheckConmonRunning(), but it synced the
state beforehand, which means it may have checked a new conmon process
every time and thus missed exits.

The fix is much simpler code: check the conmon pid, and once it is no
longer running, read the exit file and get the exit code.

This is related to #23473, but I am not sure whether it fixes that issue
because we cannot reproduce it.

Signed-off-by: Paul Holzinger <[email protected]>
Luap99 committed Aug 15, 2024
1 parent f456b53 commit 8a94331
Showing 3 changed files with 84 additions and 106 deletions.
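Before the file-by-file diff, here is a minimal, self-contained Go sketch of the waiting technique the new code relies on. It is illustration only, not code from this commit: waitForPid and the sleep child process are made up for the example, while unix.PidfdOpen, unix.Poll, and unix.Kill are the real golang.org/x/sys/unix calls. The idea is to open a pidfd for the process and poll it until the process terminates, falling back to polling kill(pid, 0) when pidfd_open is unavailable (pre Linux 5.3) or fails.

package main

import (
	"fmt"
	"os/exec"
	"time"

	"golang.org/x/sys/unix"
)

// waitForPid blocks until the process with the given pid is gone.
func waitForPid(pid int, pollInterval time.Duration) error {
	// Preferred path: pidfd_open(2) + poll(2). The pidfd becomes readable
	// as soon as the process terminates.
	if fd, err := unix.PidfdOpen(pid, 0); err == nil {
		defer unix.Close(fd)
		for {
			fds := []unix.PollFd{{Fd: int32(fd), Events: unix.POLLIN}}
			if _, err := unix.Poll(fds, -1); err != nil {
				if err == unix.EINTR {
					continue
				}
				return err
			}
			return nil
		}
	}
	// Fallback for kernels without pidfd_open (pre 5.3) or when the process
	// is already gone: kill(pid, 0) sends no signal, it only checks whether
	// the pid still exists.
	for {
		if err := unix.Kill(pid, 0); err != nil {
			if err == unix.ESRCH {
				return nil // no such process anymore
			}
			return err
		}
		time.Sleep(pollInterval)
	}
}

func main() {
	cmd := exec.Command("sleep", "2")
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	// Reap the child in the background so the pid really disappears for the
	// kill-based fallback; a zombie would still count as "existing" there.
	go func() { _ = cmd.Wait() }()

	if err := waitForPid(cmd.Process.Pid, 100*time.Millisecond); err != nil {
		panic(err)
	}
	fmt.Println("process exited")
}

Note that the pidfd path reports termination as soon as the process exits, while the kill(pid, 0) fallback keeps succeeding for an unreaped zombie, which is why the sketch reaps the child in the background.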
160 changes: 69 additions & 91 deletions libpod/container_api.go
@@ -541,119 +541,97 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
 		}
 		return -1, define.ErrCtrRemoved
 	}
-	var conmonTimer time.Timer
-	conmonTimerSet := false
-
-	conmonPidFd := c.getConmonPidFd()
-	if conmonPidFd != -1 {
-		defer unix.Close(conmonPidFd)
-	}
-	conmonPidFdTriggered := false
-
-	getExitCode := func() (bool, int32, error) {
-		containerRemoved := false
-		if !c.batched {
-			c.lock.Lock()
-			defer c.lock.Unlock()
-		}
-
-		if err := c.syncContainer(); err != nil {
-			if !errors.Is(err, define.ErrNoSuchCtr) {
-				return false, -1, err
-			}
-			containerRemoved = true
-		}
-
-		// If conmon is not alive anymore set a timer to make sure
-		// we're returning even if conmon has forcefully been killed.
-		if !conmonTimerSet && !containerRemoved {
-			conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
-			switch {
-			case errors.Is(err, define.ErrNoSuchCtr):
-				// Container has been removed, so we assume the
-				// exit code is present in the DB.
-				containerRemoved = true
-			case err != nil:
-				return false, -1, err
-			case !conmonAlive:
-				// Give the exit code at most 20 seconds to
-				// show up in the DB. That should largely be
-				// enough for the cleanup process.
-				timerDuration := time.Second * 20
-				conmonTimer = *time.NewTimer(timerDuration)
-				conmonTimerSet = true
-			case conmonAlive:
-				// Continue waiting if conmon's still running.
-				return false, -1, nil
-			}
-		}
-
-		timedout := ""
-		if !containerRemoved {
-			// If conmon is dead for more than $timerDuration or if the
-			// container has exited properly, try to look up the exit code.
-			select {
-			case <-conmonTimer.C:
-				logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
-				timedout = " [exceeded conmon timeout waiting for container to exit]"
-			default:
-				switch c.state.State {
-				case define.ContainerStateExited, define.ContainerStateConfigured:
-					// Container exited, so we can look up the exit code.
-				case define.ContainerStateStopped:
-					// Continue looping unless the restart policy is always.
-					// In this case, the container would never transition to
-					// the exited state, so we need to look up the exit code.
-					if c.config.RestartPolicy != define.RestartPolicyAlways {
-						return false, -1, nil
-					}
-				default:
-					// Continue looping
-					return false, -1, nil
-				}
-			}
-		}
-
-		exitCode, err := c.runtime.state.GetContainerExitCode(id)
-		if err != nil {
-			if errors.Is(err, define.ErrNoSuchExitCode) {
-				// If the container is configured or created, we must assume it never ran.
-				if c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
-					return true, 0, nil
-				}
-			}
-			return true, -1, fmt.Errorf("%w (container in state %s)%s", err, c.state.State, timedout)
-		}
-
-		return true, exitCode, nil
-	}
-
-	for {
-		hasExited, exitCode, err := getExitCode()
-		if hasExited {
-			return exitCode, err
-		}
-		if err != nil {
-			return -1, err
-		}
-		select {
-		case <-ctx.Done():
-			return -1, fmt.Errorf("waiting for exit code of container %s canceled", id)
-		default:
-			if conmonPidFd != -1 && !conmonPidFdTriggered {
-				// If possible (pidfd works), the first cycle we block until conmon dies
-				// If this happens, and we fall back to the old poll delay
-				// There is a deadlock in the cleanup code for "play kube" which causes
-				// conmon to not exit, so unfortunately we have to use the poll interval
-				// timeout here to avoid hanging.
-				fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}}
-				_, _ = unix.Poll(fds, int(pollInterval.Milliseconds()))
-				conmonPidFdTriggered = true
-			} else {
-				time.Sleep(pollInterval)
-			}
-		}
-	}
+	if !c.batched {
+		c.lock.Lock()
+		defer c.lock.Unlock()
+	}
+
+	// Note this is one of the rare cases where we do not like to use syncContainer() as it does read the exit file
+	// We like to avoid that here because that means we might read it before container cleanup run and possible
+	// removed the container
+	if err := c.runtime.state.UpdateContainer(c); err != nil {
+		if errors.Is(err, define.ErrNoSuchCtr) {
+			// if the container is not valid at this point as it was deleted,
+			// check if the exit code was recorded in the db.
+			exitCode, err := c.runtime.state.GetContainerExitCode(id)
+			if err == nil {
+				return exitCode, nil
+			}
+		}
+		return -1, err
+	}
+
+	conmonPID := c.state.ConmonPID
+	// conmonPID == 0 means container is not running
+	if conmonPID == 0 {
+		exitCode, err := c.runtime.state.GetContainerExitCode(id)
+		if err != nil {
+			if errors.Is(err, define.ErrNoSuchExitCode) {
+				// If the container is configured or created, we must assume it never ran.
+				if c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
+					return 0, nil
+				}
+			}
+			return -1, err
+		}
+		return exitCode, nil
+	}
+
+	conmonPidFd := c.getConmonPidFd()
+	if conmonPidFd > -1 {
+		defer unix.Close(conmonPidFd)
+	}
+
+	// we cannot wait locked as we would hold the lock forever, so we unlock and then lock again
+	c.lock.Unlock()
+	err := waitForConmonExit(conmonPID, conmonPidFd, pollInterval)
+	c.lock.Lock()
+	if err != nil {
+		return -1, fmt.Errorf("failed to wait for conmon to exit: %w", err)
+	}
+
+	// we locked again so we must sync the state
+	if err := c.syncContainer(); err != nil {
+		if errors.Is(err, define.ErrNoSuchCtr) {
+			// if the container is not valid at this point as it was deleted,
+			// check if the exit code was recorded in the db.
+			exitCode, err := c.runtime.state.GetContainerExitCode(id)
+			if err == nil {
+				return exitCode, nil
+			}
+		}
+		return -1, err
+	}
+
+	// syncContainer already did the exit file read so nothing extra for us to do here
+	return c.runtime.state.GetContainerExitCode(id)
 }
+
+func waitForConmonExit(conmonPID, conmonPidFd int, pollInterval time.Duration) error {
+	if conmonPidFd > -1 {
+		for {
+			fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}}
+			if _, err := unix.Poll(fds, -1); err != nil {
+				if err == unix.EINTR {
+					continue
+				}
+				return err
+			}
+			return nil
+		}
+	}
+	// no pidfd support, we must poll the pid
+	for {
+		if err := unix.Kill(conmonPID, 0); err != nil {
+			if err == unix.ESRCH {
+				break
+			}
+			return err
+		}
+		time.Sleep(pollInterval)
+	}
+	return nil
+}
 
 type waitResult struct {
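One detail in the diff above that is easy to miss: the new code calls syncContainer() again after re-taking the lock, because anything read before the lock was released can be stale by the time the wait returns. Below is a minimal sketch of that unlock, wait, re-lock, re-read shape; containerState, waitExit, and the done channel are hypothetical names for illustration, not libpod types.

package main

import (
	"fmt"
	"sync"
	"time"
)

// containerState is a stand-in for the shared container state that another
// goroutine (libpod's cleanup process) updates while the waiter is blocked.
type containerState struct {
	mu       sync.Mutex
	exitCode int
}

// waitExit shows the shape of the pattern: drop the lock before blocking,
// re-lock afterwards, and re-read the shared state.
func (s *containerState) waitExit(done <-chan struct{}) int {
	s.mu.Lock()
	defer s.mu.Unlock()

	// A value read under the lock *before* waiting; it will be stale.
	early := s.exitCode

	// Blocking while holding the lock would prevent the writer from ever
	// recording the exit code, so release the lock for the duration of the
	// wait and take it again afterwards.
	s.mu.Unlock()
	<-done
	s.mu.Lock()

	// Anything read before the unlock may be outdated; re-reading here is
	// what the syncContainer() call in the diff corresponds to.
	fmt.Println("value read before waiting:", early)
	return s.exitCode
}

func main() {
	s := &containerState{}
	done := make(chan struct{})
	go func() {
		time.Sleep(50 * time.Millisecond)
		s.mu.Lock()
		s.exitCode = 1
		s.mu.Unlock()
		close(done)
	}()
	fmt.Println("exit code after waiting:", s.waitExit(done))
}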
18 changes: 8 additions & 10 deletions libpod/container_internal_linux.go
@@ -694,16 +694,14 @@ func (c *Container) makePlatformBindMounts() error {
 }
 
 func (c *Container) getConmonPidFd() int {
-	if c.state.ConmonPID != 0 {
-		// Track lifetime of conmon precisely using pidfd_open + poll.
-		// There are many cases for this to fail, for instance conmon is dead
-		// or pidfd_open is not supported (pre linux 5.3), so fall back to the
-		// traditional loop with poll + sleep
-		if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
-			return fd
-		} else if err != unix.ENOSYS && err != unix.ESRCH {
-			logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
-		}
+	// Track lifetime of conmon precisely using pidfd_open + poll.
+	// There are many cases for this to fail, for instance conmon is dead
+	// or pidfd_open is not supported (pre linux 5.3), so fall back to the
+	// traditional loop with poll + sleep
+	if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
+		return fd
+	} else if err != unix.ENOSYS && err != unix.ESRCH {
+		logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
 	}
 	return -1
 }
12 changes: 7 additions & 5 deletions test/system/030-run.bats
@@ -1289,13 +1289,15 @@ EOF
     rm -rf $romount
 }
 
-@test "podman run --restart=always -- wait" {
+@test "podman run --restart=always/on-failure -- wait" {
     # regression test for #18572 to make sure Podman waits less than 20 seconds
     ctr=c_$(safename)
-    run_podman run -d --restart=always --name=$ctr $IMAGE false
-    PODMAN_TIMEOUT=20 run_podman wait $ctr
-    is "$output" "1" "container should exit 1"
-    run_podman rm -f -t0 $ctr
+    for policy in always on-failure; do
+        run_podman run -d --restart=$policy --name=$ctr $IMAGE false
+        PODMAN_TIMEOUT=20 run_podman wait $ctr
+        is "$output" "1" "container should exit 1 (policy: $policy)"
+        run_podman rm -f -t0 $ctr
+    done
 }
 
 @test "podman run - custom static_dir" {
