diff --git a/docs/source/markdown/podman-wait.1.md.in b/docs/source/markdown/podman-wait.1.md.in
index 75376f6b36..218ca04ba1 100644
--- a/docs/source/markdown/podman-wait.1.md.in
+++ b/docs/source/markdown/podman-wait.1.md.in
@@ -16,10 +16,13 @@
 separated by newline in the same order as they were given to the command. An
 exit code of -1 is emitted for all conditions other than "stopped" and
 "exited".
 
-NOTE: there is an inherent race condition when waiting for containers with a
-restart policy of `always` or `on-failure`, such as those created by `podman
-kube play`. Such containers may be repeatedly exiting and restarting, possibly
-with different exit codes, but `podman wait` can only display and detect one.
+When waiting for containers with a restart policy of `always` or `on-failure`,
+such as those created by `podman kube play`, the containers may be repeatedly
+exiting and restarting, possibly with different exit codes. `podman wait` only
+displays and detects the first exit after the wait command was started.
+
+When running a container with `podman run --rm`, `podman wait` also waits for
+the container to be fully removed before it returns.
 
 ## OPTIONS
 
diff --git a/libpod/container_api.go b/libpod/container_api.go
index 300fe5ca20..1efa82800c 100644
--- a/libpod/container_api.go
+++ b/libpod/container_api.go
@@ -541,119 +541,97 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
         }
         return -1, define.ErrCtrRemoved
     }
-    var conmonTimer time.Timer
-    conmonTimerSet := false
-    conmonPidFd := c.getConmonPidFd()
-    if conmonPidFd != -1 {
-        defer unix.Close(conmonPidFd)
-    }
-    conmonPidFdTriggered := false
-
-    getExitCode := func() (bool, int32, error) {
-        containerRemoved := false
-        if !c.batched {
-            c.lock.Lock()
-            defer c.lock.Unlock()
-        }
-
-        if err := c.syncContainer(); err != nil {
-            if !errors.Is(err, define.ErrNoSuchCtr) {
-                return false, -1, err
-            }
-            containerRemoved = true
-        }
-
-        // If conmon is not alive anymore set a timer to make sure
-        // we're returning even if conmon has forcefully been killed.
-        if !conmonTimerSet && !containerRemoved {
-            conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
-            switch {
-            case errors.Is(err, define.ErrNoSuchCtr):
-                // Container has been removed, so we assume the
-                // exit code is present in the DB.
-                containerRemoved = true
-            case err != nil:
-                return false, -1, err
-            case !conmonAlive:
-                // Give the exit code at most 20 seconds to
-                // show up in the DB. That should largely be
-                // enough for the cleanup process.
-                timerDuration := time.Second * 20
-                conmonTimer = *time.NewTimer(timerDuration)
-                conmonTimerSet = true
-            case conmonAlive:
-                // Continue waiting if conmon's still running.
-                return false, -1, nil
-            }
-        }
+    if !c.batched {
+        c.lock.Lock()
+        defer c.lock.Unlock()
 
-        timedout := ""
-        if !containerRemoved {
-            // If conmon is dead for more than $timerDuration or if the
-            // container has exited properly, try to look up the exit code.
-            select {
-            case <-conmonTimer.C:
-                logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
-                timedout = " [exceeded conmon timeout waiting for container to exit]"
-            default:
-                switch c.state.State {
-                case define.ContainerStateExited, define.ContainerStateConfigured:
-                    // Container exited, so we can look up the exit code.
-                case define.ContainerStateStopped:
-                    // Continue looping unless the restart policy is always.
-                    // In this case, the container would never transition to
-                    // the exited state, so we need to look up the exit code.
-                    if c.config.RestartPolicy != define.RestartPolicyAlways {
-                        return false, -1, nil
-                    }
-                default:
-                    // Continue looping
-                    return false, -1, nil
+        // Note: this is one of the rare cases where we do not want to use syncContainer(), as it reads the exit file.
+        // We want to avoid that here because we might read it before container cleanup has run and possibly
+        // removed the container.
+        if err := c.runtime.state.UpdateContainer(c); err != nil {
+            if errors.Is(err, define.ErrNoSuchCtr) {
+                // if the container is no longer valid at this point because it was deleted,
+                // check if the exit code was recorded in the db.
+                exitCode, err := c.runtime.state.GetContainerExitCode(id)
+                if err == nil {
+                    return exitCode, nil
+                }
             }
+            return -1, err
         }
+    }
 
+    conmonPID := c.state.ConmonPID
+    // conmonPID == 0 means the container is not running
+    if conmonPID == 0 {
         exitCode, err := c.runtime.state.GetContainerExitCode(id)
         if err != nil {
             if errors.Is(err, define.ErrNoSuchExitCode) {
                 // If the container is configured or created, we must assume it never ran.
                 if c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
-                    return true, 0, nil
+                    return 0, nil
                 }
             }
-            return true, -1, fmt.Errorf("%w (container in state %s)%s", err, c.state.State, timedout)
+            return -1, err
         }
+        return exitCode, nil
+    }
 
-        return true, exitCode, nil
+    conmonPidFd := c.getConmonPidFd()
+    if conmonPidFd > -1 {
+        defer unix.Close(conmonPidFd)
     }
 
-    for {
-        hasExited, exitCode, err := getExitCode()
-        if hasExited {
-            return exitCode, err
+    // we cannot wait while locked as we would hold the lock forever, so unlock here and re-lock below
+    c.lock.Unlock()
+    err := waitForConmonExit(conmonPID, conmonPidFd, pollInterval)
+    c.lock.Lock()
+    if err != nil {
+        return -1, fmt.Errorf("failed to wait for conmon to exit: %w", err)
+    }
+
+    // we locked again, so we must sync the state
+    if err := c.syncContainer(); err != nil {
+        if errors.Is(err, define.ErrNoSuchCtr) {
+            // if the container is no longer valid at this point because it was deleted,
+            // check if the exit code was recorded in the db.
+            exitCode, err := c.runtime.state.GetContainerExitCode(id)
+            if err == nil {
+                return exitCode, nil
+            }
         }
-        if err != nil {
-            return -1, err
+        return -1, err
+    }
+
+    // syncContainer() already read the exit file, so there is nothing extra to do here
+    return c.runtime.state.GetContainerExitCode(id)
+}
+
+func waitForConmonExit(conmonPID, conmonPidFd int, pollInterval time.Duration) error {
+    if conmonPidFd > -1 {
+        for {
+            fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}}
+            if _, err := unix.Poll(fds, -1); err != nil {
+                if err == unix.EINTR {
+                    continue
+                }
+                return err
+            }
+            return nil
         }
-        select {
-        case <-ctx.Done():
-            return -1, fmt.Errorf("waiting for exit code of container %s canceled", id)
-        default:
-            if conmonPidFd != -1 && !conmonPidFdTriggered {
-                // If possible (pidfd works), the first cycle we block until conmon dies
-                // If this happens, and we fall back to the old poll delay
-                // There is a deadlock in the cleanup code for "play kube" which causes
-                // conmon to not exit, so unfortunately we have to use the poll interval
-                // timeout here to avoid hanging.
-                fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}}
-                _, _ = unix.Poll(fds, int(pollInterval.Milliseconds()))
-                conmonPidFdTriggered = true
-            } else {
-                time.Sleep(pollInterval)
+    }
             }
+    // no pidfd support, we must poll the pid
+    for {
+        if err := unix.Kill(conmonPID, 0); err != nil {
+            if err == unix.ESRCH {
+                break
             }
+            return err
         }
+        time.Sleep(pollInterval)
     }
+    return nil
 }
 
 type waitResult struct {
diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 50acaf3e7f..51a0793bb7 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -2163,6 +2163,11 @@ func (c *Container) stopPodIfNeeded(ctx context.Context) error {
         return nil
     }
 
+    // Never try to stop the pod when an init container stopped
+    if c.IsInitCtr() {
+        return nil
+    }
+
     pod, err := c.runtime.state.Pod(c.config.Pod)
     if err != nil {
         return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err)
diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go
index 5ea2448e20..e4b3e71406 100644
--- a/libpod/container_internal_linux.go
+++ b/libpod/container_internal_linux.go
@@ -694,16 +694,14 @@ func (c *Container) makePlatformBindMounts() error {
 }
 
 func (c *Container) getConmonPidFd() int {
-    if c.state.ConmonPID != 0 {
-        // Track lifetime of conmon precisely using pidfd_open + poll.
-        // There are many cases for this to fail, for instance conmon is dead
-        // or pidfd_open is not supported (pre linux 5.3), so fall back to the
-        // traditional loop with poll + sleep
-        if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
-            return fd
-        } else if err != unix.ENOSYS && err != unix.ESRCH {
-            logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
-        }
+    // Track lifetime of conmon precisely using pidfd_open + poll.
+    // There are many cases for this to fail, for instance conmon is dead
+    // or pidfd_open is not supported (pre linux 5.3), so fall back to the
+    // traditional loop with poll + sleep
+    if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
+        return fd
+    } else if err != unix.ENOSYS && err != unix.ESRCH {
+        logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
     }
     return -1
 }
diff --git a/libpod/runtime_volume.go b/libpod/runtime_volume.go
index 6066145f02..ff04eec3cc 100644
--- a/libpod/runtime_volume.go
+++ b/libpod/runtime_volume.go
@@ -28,13 +28,6 @@ func (r *Runtime) RemoveVolume(ctx context.Context, v *Volume, force bool, timeo
         return define.ErrRuntimeStopped
     }
 
-    if !v.valid {
-        if ok, _ := r.state.HasVolume(v.Name()); !ok {
-            // Volume probably already removed
-            // Or was never in the runtime to begin with
-            return nil
-        }
-    }
     return r.removeVolume(ctx, v, force, timeout, false)
 }
 
diff --git a/libpod/runtime_volume_common.go b/libpod/runtime_volume_common.go
index 44d4f667d7..532ea195a3 100644
--- a/libpod/runtime_volume_common.go
+++ b/libpod/runtime_volume_common.go
@@ -357,13 +357,11 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo
         return define.ErrVolumeRemoved
     }
 
-    v.lock.Lock()
-    defer v.lock.Unlock()
-
-    // Update volume status to pick up a potential removal from state
-    if err := v.update(); err != nil {
-        return err
-    }
+    // DANGEROUS: Do not lock here yet because we might need to remove containers first.
+    // In general we must always acquire the ctr lock before a volume lock, so we cannot lock here.
+    // THIS MUST BE DONE to prevent ABBA deadlocks.
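+    // (e.g. container removal holds the ctr lock and then locks the volume when removing the
+    // container's anonymous volumes, while we would hold the volume lock and lock containers below)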
+    // It also means there are several races around creating containers with volumes and removing
+    // them in parallel. However, that problem exists regardless of whether we take the lock here or not.
 
     deps, err := r.state.VolumeInUse(v)
     if err != nil {
@@ -400,6 +398,15 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo
         }
     }
 
+    // Ok, now we are good to lock.
+    v.lock.Lock()
+    defer v.lock.Unlock()
+
+    // Update volume status to pick up a potential removal from state
+    if err := v.update(); err != nil {
+        return err
+    }
+
     // If the volume is still mounted - force unmount it
     if err := v.unmount(true); err != nil {
         if force {
diff --git a/test/system/030-run.bats b/test/system/030-run.bats
index ec788642b9..1aa079dab4 100644
--- a/test/system/030-run.bats
+++ b/test/system/030-run.bats
@@ -1289,13 +1289,15 @@ EOF
     rm -rf $romount
 }
 
-@test "podman run --restart=always -- wait" {
+@test "podman run --restart=always/on-failure -- wait" {
     # regression test for #18572 to make sure Podman waits less than 20 seconds
     ctr=c_$(safename)
-    run_podman run -d --restart=always --name=$ctr $IMAGE false
-    PODMAN_TIMEOUT=20 run_podman wait $ctr
-    is "$output" "1" "container should exit 1"
-    run_podman rm -f -t0 $ctr
+    for policy in always on-failure; do
+        run_podman run -d --restart=$policy --name=$ctr $IMAGE false
+        PODMAN_TIMEOUT=20 run_podman wait $ctr
+        is "$output" "1" "container should exit 1 (policy: $policy)"
+        run_podman rm -f -t0 $ctr
+    done
 }
 
 @test "podman run - custom static_dir" {
diff --git a/test/system/500-networking.bats b/test/system/500-networking.bats
index 0d21391195..b0151c0a27 100644
--- a/test/system/500-networking.bats
+++ b/test/system/500-networking.bats
@@ -935,6 +935,27 @@ EOF
     fi
 }
 
+function wait_for_restart_count() {
+    local cname="$1"
+    local count="$2"
+    local tname="$3"
+
+    local timeout=10
+    while :; do
+        # Previously this would fail as the container would run out of ips after 5 restarts.
+        run_podman inspect --format "{{.RestartCount}}" $cname
+        if [[ "$output" == "$count" ]]; then
+            break
+        fi
+
+        timeout=$((timeout - 1))
+        if [[ $timeout -eq 0 ]]; then
+            die "Timed out waiting for RestartCount with $tname"
+        fi
+        sleep 0.5
+    done
+}
+
 # Test for https://github.com/containers/podman/issues/18615
 @test "podman network cleanup --userns + --restart" {
     skip_if_cgroupsv1 "run --uidmap fails on cgroups v1 (issue 15025, wontfix)"
@@ -955,10 +976,8 @@ EOF
     # This will cause 7 containers runs with the restart policy (one more than the on failure limit)
     # Since they run sequentially they should run fine without allocating all ips.
     run_podman 1 run --name $cname --network $net1 --restart on-failure:6 --userns keep-id $IMAGE false
-
-    # Previously this would fail as the container would run out of ips after 5 restarts.
-    run_podman inspect --format "{{.RestartCount}}" $cname
-    assert "$output" == "6" "RestartCount for failing container with bridge network"
+    wait_for_restart_count $cname 6 "custom network"
+    run_podman wait $cname
 
     # Now make sure we can still run a container with free ips.
     run_podman run --rm --network $net1 $IMAGE true
@@ -969,23 +988,23 @@ EOF
     if has_slirp4netns; then
         cname2=con2-$(random_string 10)
         run_podman 1 run --name $cname2 --network slirp4netns --restart on-failure:2 --userns keep-id $IMAGE false
-        run_podman inspect --format "{{.RestartCount}}" $cname2
-        assert "$output" == "2" "RestartCount for failing container with slirp4netns"
+        wait_for_restart_count $cname2 2 "slirp4netns"
+        run_podman wait $cname2
     fi
 
     if is_rootless; then
         # pasta can only run rootless
         cname3=con3-$(random_string 10)
         run_podman 1 run --name $cname3 --network pasta --restart on-failure:2 --userns keep-id $IMAGE false
-        run_podman inspect --format "{{.RestartCount}}" $cname3
-        assert "$output" == "2" "RestartCount for failing container with pasta"
+        wait_for_restart_count $cname3 2 "pasta"
+        run_podman wait $cname3
     else
         # This is racy if other programs modify /run/netns while the test is running.
         # However I think the risk is minimal and I think checking for this is important.
         assert "$(ls /run/netns | wc -l)" == "$netns_count" "/run/netns has no leaked netns files"
     fi
 
-    run_podman rm -f -t0 $cname $cname2 $cname3
+    run_podman rm $cname $cname2 $cname3
     run_podman network rm $net1
 }
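
A minimal standalone sketch of the wait technique the new `waitForConmonExit()` uses: prefer `pidfd_open(2)` + `poll(2)` to track the target process, and fall back to polling `kill(pid, 0)` on kernels without pidfd support (pre 5.3). The program, the `waitForPid` helper, and the 100ms fallback interval are illustrative assumptions and not part of Podman; it is Linux-only and requires `golang.org/x/sys/unix`.

```go
// pidwait: sketch of pidfd_open + poll with a kill(pid, 0) fallback.
// waitForPid and the interval below are illustrative, not Podman API.
package main

import (
	"fmt"
	"os"
	"strconv"
	"time"

	"golang.org/x/sys/unix"
)

// waitForPid blocks until the process with the given pid exits.
func waitForPid(pid int, pollInterval time.Duration) error {
	// Preferred path: pidfd_open(2); the fd becomes readable when the process
	// exits. It fails with ENOSYS before Linux 5.3 or ESRCH if the process is
	// already gone, in which case we use the fallback below.
	if pidfd, err := unix.PidfdOpen(pid, 0); err == nil {
		defer unix.Close(pidfd)
		for {
			fds := []unix.PollFd{{Fd: int32(pidfd), Events: unix.POLLIN}}
			if _, err := unix.Poll(fds, -1); err != nil {
				if err == unix.EINTR {
					continue
				}
				return err
			}
			return nil
		}
	}

	// Fallback: kill with signal 0 performs only an existence check; ESRCH
	// means the process has exited (EPERM would mean it exists but is not ours).
	for {
		if err := unix.Kill(pid, 0); err != nil {
			if err == unix.ESRCH {
				return nil
			}
			return err
		}
		time.Sleep(pollInterval)
	}
}

func main() {
	if len(os.Args) != 2 {
		fmt.Fprintln(os.Stderr, "usage: pidwait <pid>")
		os.Exit(1)
	}
	pid, err := strconv.Atoi(os.Args[1])
	if err != nil {
		fmt.Fprintf(os.Stderr, "invalid pid %q: %v\n", os.Args[1], err)
		os.Exit(1)
	}
	if err := waitForPid(pid, 100*time.Millisecond); err != nil {
		fmt.Fprintln(os.Stderr, "wait failed:", err)
		os.Exit(1)
	}
	fmt.Printf("process %d exited\n", pid)
}
```

Run it against a long-lived process (for example `sleep 60 &` followed by `go run pidwait.go $!`) and it returns the moment that process exits, without any periodic wakeups on the pidfd path.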