From 6f919af78ba19c63e005888276dbec635e38cc23 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Mon, 16 Jan 2023 11:45:19 +0100 Subject: [PATCH 1/2] add a comment to container removal Every time I look at a container-removal issue I wonder why the container isn't locked directly here, so let's add a comment here. I am not sure whether I would be better if callers took care of locking but for now the comment will safe the future me and probably other readers some time. [NO NEW TESTS NEEDED] Signed-off-by: Valentin Rothberg --- libpod/runtime_ctr.go | 1 + 1 file changed, 1 insertion(+) diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index 4d13d7ffb1..15efdf07be 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -608,6 +608,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai // be removed also if and only if the container is the sole user // Otherwise, RemoveContainer will return an error if the container is running func (r *Runtime) RemoveContainer(ctx context.Context, c *Container, force bool, removeVolume bool, timeout *uint) error { + // NOTE: container will be locked down the road. return r.removeContainer(ctx, c, force, removeVolume, false, false, timeout) } From 067442b5701f0c0759c2da09c298d72522381e70 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Mon, 16 Jan 2023 13:47:30 +0100 Subject: [PATCH 2/2] container kill: handle stopped/exited container The container lock is released before stopping/killing which implies certain race conditions with, for instance, the cleanup process changing the container state to stopped, exited or other states. The (remaining) flakes seen in #16142 and #15367 strongly indicate a race in between the stopping/killing a container and the cleanup process. To fix the flake make sure to ignore invalid-state errors. An alternative fix would be to change `KillContainer` to not return such errors at all but commit c77691f06f61 indicates an explicit desire to have these errors being reported in the sig proxy. [NO NEW TESTS NEEDED] as it's a race already covered by the system tests. Fixes: #16142 Fixes: #15367 Signed-off-by: Valentin Rothberg --- libpod/oci_conmon_common.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index b2414dd6ea..dc370928fa 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -429,15 +429,20 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) } if err := r.KillContainer(ctr, uint(unix.SIGKILL), all); err != nil { - // If the PID is 0, then the container is already stopped. - if ctr.state.PID == 0 { - return nil - } - // Again, check if the container is gone. If it is, exit cleanly. - if aliveErr := unix.Kill(ctr.state.PID, 0); errors.Is(aliveErr, unix.ESRCH) { - return nil + // Ignore the error if KillContainer complains about it already + // being stopped or exited. There's an inherent race with the + // cleanup process (see #16142). + if !(errors.Is(err, define.ErrCtrStateInvalid) && ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited)) { + // If the PID is 0, then the container is already stopped. + if ctr.state.PID == 0 { + return nil + } + // Again, check if the container is gone. If it is, exit cleanly. + if aliveErr := unix.Kill(ctr.state.PID, 0); errors.Is(aliveErr, unix.ESRCH) { + return nil + } + return fmt.Errorf("sending SIGKILL to container %s: %w", ctr.ID(), err) } - return fmt.Errorf("sending SIGKILL to container %s: %w", ctr.ID(), err) } // Give runtime a few seconds to make it happen