From edb64f8a768e28352e43578fcf2ad8c209689ea9 Mon Sep 17 00:00:00 2001
From: Paul Holzinger <pholzing@redhat.com>
Date: Wed, 19 Apr 2023 11:58:33 +0200
Subject: [PATCH] libpod: stop containers with --restart=always

Commit 1ab833fb73 improved the situation but it is still not enough.
If you run short lived containers with --restart=always podman is
basically permanently restarting them. To only way to stop this is
podman stop. However podman stop does not do anything when the
container is already in a not running state. While this makes sense we
should still mark the container as explicitly stopped by the user.

Together with the change in shouldRestart() which now checks for
StoppedByUser this makes sure the cleanup process is not going to start
it back up again.

A simple reproducer is:
```
podman run --restart=always --name test -d alpine true
podman stop test
```
then check if the container is still running, the behavior is very
flaky, it took me like 20 podman stop tries before I finally hit the
correct window were it was stopped permanently.
With this patch it worked on the first try.

Fixes #18259

[NO NEW TESTS NEEDED] This is super flaky and hard to correctly test
in CI. MY ginkgo v2 work seems to trigger this in play kube tests so
that should catch at least some regressions. Also this may be something
that should be tested at podman test days by users (#17912).

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
---
 libpod/container_api.go      |  8 -------
 libpod/container_internal.go | 42 ++++++++++++++++++++++++++++++------
 2 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/libpod/container_api.go b/libpod/container_api.go
index 08c2557105..f928e51667 100644
--- a/libpod/container_api.go
+++ b/libpod/container_api.go
@@ -254,14 +254,6 @@ func (c *Container) StopWithTimeout(timeout uint) (finalErr error) {
 		}
 	}
 
-	if c.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
-		return define.ErrCtrStopped
-	}
-
-	if !c.ensureState(define.ContainerStateCreated, define.ContainerStateRunning, define.ContainerStateStopping) {
-		return fmt.Errorf("can only stop created or running containers. %s is in state %s: %w", c.ID(), c.state.State.String(), define.ErrCtrStateInvalid)
-	}
-
 	return c.stop(timeout)
 }
 
diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index f8b0e5fe3a..3cc3c735a2 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -238,6 +238,11 @@ func (c *Container) shouldRestart() bool {
 		}
 	}
 
+	// Explicitly stopped by user, do not restart again.
+	if c.state.StoppedByUser {
+		return false
+	}
+
 	// If we did not get a restart policy match, return false
 	// Do the same if we're not a policy that restarts.
 	if !c.state.RestartPolicyMatch ||
@@ -1307,15 +1312,38 @@ func (c *Container) stop(timeout uint) error {
 		}
 	}
 
-	// Set the container state to "stopping" and unlock the container
-	// before handing it over to conmon to unblock other commands.  #8501
-	// demonstrates nicely that a high stop timeout will block even simple
-	// commands such as `podman ps` from progressing if the container lock
-	// is held when busy-waiting for the container to be stopped.
+	// OK, the following code looks a bit weird but we have to make sure we can stop
+	// containers with the restart policy always, to do this we have to set
+	// StoppedByUser even when there is nothing to stop right now. This is due to the
+	// cleanup process waiting on the container lock and then afterwards restarts it.
+	// shouldRestart() then checks for StoppedByUser and does not restart it.
+	// https://github.com/containers/podman/issues/18259
+	var cannotStopErr error
+	if c.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
+		cannotStopErr = define.ErrCtrStopped
+	} else if !c.ensureState(define.ContainerStateCreated, define.ContainerStateRunning, define.ContainerStateStopping) {
+		cannotStopErr = fmt.Errorf("can only stop created or running containers. %s is in state %s: %w", c.ID(), c.state.State.String(), define.ErrCtrStateInvalid)
+	}
+
 	c.state.StoppedByUser = true
-	c.state.State = define.ContainerStateStopping
+	if cannotStopErr == nil {
+		// Set the container state to "stopping" and unlock the container
+		// before handing it over to conmon to unblock other commands.  #8501
+		// demonstrates nicely that a high stop timeout will block even simple
+		// commands such as `podman ps` from progressing if the container lock
+		// is held when busy-waiting for the container to be stopped.
+		c.state.State = define.ContainerStateStopping
+	}
 	if err := c.save(); err != nil {
-		return fmt.Errorf("saving container %s state before stopping: %w", c.ID(), err)
+		rErr := fmt.Errorf("saving container %s state before stopping: %w", c.ID(), err)
+		if cannotStopErr == nil {
+			return rErr
+		}
+		// we return below with cannotStopErr
+		logrus.Error(rErr)
+	}
+	if cannotStopErr != nil {
+		return cannotStopErr
 	}
 	if !c.batched {
 		c.lock.Unlock()