fix deadlock between play kube and cleanup

There was a deadlock between two concurrent processes: play kube and cleanup, the latter being invoked after a container exits when a RestartPolicy is set. Before this fix, the cleanup command did not take the pod's lock, so the two processes could acquire the pod lock and the container lock in opposite orders. Cleanup now locks the pod before locking the container. [NO NEW TESTS NEEDED] Closes containers#14921 Signed-off-by: Mikhail Khachayants <[email protected]>
tyler92 · Jul 19, 2022 · 5b1172b · 5b1172b
1 parent 252fc7c
commit 5b1172b
Showing 1 changed file with 17 additions and 0 deletions.
diff --git a/libpod/container_api.go b/libpod/container_api.go
@@ -667,7 +667,24 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
 // It also cleans up the network stack
 func (c *Container) Cleanup(ctx context.Context) error {
 	if !c.batched {
+		// We need to lock the pod before we lock the container.
+		// To avoid races around cleaning up a container and the pod it is in.
+		if c.config.Pod != "" {
+			pod, err := c.runtime.state.Pod(c.config.Pod)
+			if err != nil {
+				return errors.Wrapf(err, "container %s is in pod %s, but pod cannot be retrieved", c.ID(), pod.ID())
+			}
+
+			// Lock the pod while we're cleaning up container
+			if pod.config.LockID == c.config.LockID {
+				return errors.Wrapf(define.ErrWillDeadlock, "container %s and pod %s share lock ID %d", c.ID(), pod.ID(), c.config.LockID)
+			}
+			pod.lock.Lock()
+			defer pod.lock.Unlock()
+		}
+
 		c.lock.Lock()
+
 		defer c.lock.Unlock()
 
 		if err := c.syncContainer(); err != nil {