Skip to content

Commit

Permalink
fix deadlock between play kube and cleanup
Browse files Browse the repository at this point in the history
There was a deadlock between two concurrent processes: play kube and cleanup,
that is called after container exit when RestartPolicy is used. Before the fix,
the cleanup command didn't lock Pod's lock, so there was a possibility of
obtaining two locks in different order in two processes.

[NO NEW TESTS NEEDED]

Closes #14921

Signed-off-by: Mikhail Khachayants <[email protected]>
  • Loading branch information
tyler92 committed Jul 19, 2022
1 parent 252fc7c commit e7dfdba
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 5 deletions.
29 changes: 26 additions & 3 deletions libpod/container_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -663,9 +663,9 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
return result.code, result.err
}

// Cleanup unmounts all mount points in container and cleans up container storage
// It also cleans up the network stack
func (c *Container) Cleanup(ctx context.Context) error {
// cleanupLocked unmounts all mount points in container and cleans up container storage
// It also cleans up the network stack. The caller must hold pod.lock.
func (c *Container) cleanupLocked(ctx context.Context) error {
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
Expand Down Expand Up @@ -714,6 +714,29 @@ func (c *Container) Cleanup(ctx context.Context) error {
return c.cleanup(ctx)
}

// Cleanup unmounts all mount points in container and cleans up container storage
// It also cleans up the network stack
func (c *Container) Cleanup(ctx context.Context) error {
// We need to lock the pod before we lock the container.
// To avoid races around cleaning up a container and the pod it is in.
if c.config.Pod != "" {
pod, err := c.runtime.state.Pod(c.config.Pod)
if err != nil {
return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), pod.ID(), err)
}

// Lock the pod while we're cleaning up container
if pod.config.LockID == c.config.LockID {
return fmt.Errorf("container %s and pod %s share lock ID %d: %w", c.ID(), pod.ID(), c.config.LockID, define.ErrWillDeadlock)
}

pod.lock.Lock()
defer pod.lock.Unlock()
}

return c.cleanupLocked(ctx)
}

// Batch starts a batch operation on the given container
// All commands in the passed function will execute under the same lock and
// without synchronizing state after each operation
Expand Down
4 changes: 2 additions & 2 deletions libpod/pod_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
}

if cleanup {
return c.Cleanup(ctx)
return c.cleanupLocked(ctx)
}

return nil
Expand Down Expand Up @@ -286,7 +286,7 @@ func (p *Pod) Cleanup(ctx context.Context) (map[string]error, error) {
c := ctr
logrus.Debugf("Adding parallel job to clean up container %s", c.ID())
retChan := parallel.Enqueue(ctx, func() error {
return c.Cleanup(ctx)
return c.cleanupLocked(ctx)
})

ctrErrChan[c.ID()] = retChan
Expand Down

0 comments on commit e7dfdba

Please sign in to comment.