From e8b35a8c20a68c852a5819dde39dc077acc11159 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Wed, 18 Jan 2023 13:36:25 +0100 Subject: [PATCH 1/5] waitPidStop: simplify code The code can be simplified by using a timer directly. [NO NEW TESTS NEEDED] - should not change behavior. Signed-off-by: Valentin Rothberg --- libpod/oci_conmon_common.go | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index dc370928fa..5ab1a335d8 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -949,31 +949,20 @@ func waitContainerStop(ctr *Container, timeout time.Duration) error { // Wait for a given PID to stop func waitPidStop(pid int, timeout time.Duration) error { - done := make(chan struct{}) - chControl := make(chan struct{}) - go func() { - for { - select { - case <-chControl: - return - default: - if err := unix.Kill(pid, 0); err != nil { - if err == unix.ESRCH { - close(done) - return - } - logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) + timer := time.NewTimer(timeout) + for { + select { + case <-timer.C: + return fmt.Errorf("given PID did not die within timeout") + default: + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + return nil } - time.Sleep(100 * time.Millisecond) + logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) } + time.Sleep(100 * time.Millisecond) } - }() - select { - case <-done: - return nil - case <-time.After(timeout): - close(chControl) - return fmt.Errorf("given PIDs did not die within timeout") } } From ac47d07194c3d4163524047231d3869dc3324349 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Thu, 19 Jan 2023 10:57:31 +0100 Subject: [PATCH 2/5] StopContainer: small refactor Move the stopSignal decl into the branch where it's actually used. [NO NEW TESTS NEEDED] as it's just a small refactor. Signed-off-by: Valentin Rothberg --- libpod/oci_conmon_common.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 5ab1a335d8..9b95772b83 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -400,12 +400,11 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) return nil } - stopSignal := ctr.config.StopSignal - if stopSignal == 0 { - stopSignal = uint(syscall.SIGTERM) - } - if timeout > 0 { + stopSignal := ctr.config.StopSignal + if stopSignal == 0 { + stopSignal = uint(syscall.SIGTERM) + } if err := r.KillContainer(ctr, stopSignal, all); err != nil { // Is the container gone? // If so, it probably died between the first check and From e0f671007d82f81a7a1236ceabb90c58007133c3 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Thu, 19 Jan 2023 10:59:23 +0100 Subject: [PATCH 3/5] StopSignal: add a comment Add a comment when SIGKILL is being used. It may help future readers better comprehend what's going on and why. [NO NEW TESTS NEEDED] - cannot test a comment :^) Signed-off-by: Valentin Rothberg --- libpod/oci_conmon_common.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 9b95772b83..23ee52cf8a 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -427,6 +427,8 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) } } + // If the timeout was set to 0 or if stopping the container with the + // specified signal did not work, use the big hammer with SIGKILL. if err := r.KillContainer(ctr, uint(unix.SIGKILL), all); err != nil { // Ignore the error if KillContainer complains about it already // being stopped or exited. There's an inherent race with the From fd42c1dcb8d7cc86e6e62c7628a6093573a9ed36 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Thu, 19 Jan 2023 11:03:23 +0100 Subject: [PATCH 4/5] StopContainer: return if cleanup process changed state Commit 067442b5701f improved stopping/killing a container by detecting whether the cleanup process has already fired and changed the state of the container. Further improve on that by returning early instead of trying to wait for the PID to finish. At that point we know that the container has exited but the previous PID may have been recycled already by the kernel. [NO NEW TESTS NEEDED] - the absence of the two flaking tests recorded in #17142 will tell. Fixes: #17142 Signed-off-by: Valentin Rothberg --- libpod/oci_conmon_common.go | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 23ee52cf8a..c11787ced3 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -430,20 +430,23 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) // If the timeout was set to 0 or if stopping the container with the // specified signal did not work, use the big hammer with SIGKILL. if err := r.KillContainer(ctr, uint(unix.SIGKILL), all); err != nil { - // Ignore the error if KillContainer complains about it already - // being stopped or exited. There's an inherent race with the - // cleanup process (see #16142). - if !(errors.Is(err, define.ErrCtrStateInvalid) && ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited)) { - // If the PID is 0, then the container is already stopped. - if ctr.state.PID == 0 { - return nil - } - // Again, check if the container is gone. If it is, exit cleanly. - if aliveErr := unix.Kill(ctr.state.PID, 0); errors.Is(aliveErr, unix.ESRCH) { - return nil - } - return fmt.Errorf("sending SIGKILL to container %s: %w", ctr.ID(), err) + // There's an inherent race with the cleanup process (see + // #16142, #17142). If the container has already been marked as + // stopped or exited by the cleanup process, we can return + // immediately. + if errors.Is(err, define.ErrCtrStateInvalid) && ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { + return nil + } + + // If the PID is 0, then the container is already stopped. + if ctr.state.PID == 0 { + return nil + } + // Again, check if the container is gone. If it is, exit cleanly. + if aliveErr := unix.Kill(ctr.state.PID, 0); errors.Is(aliveErr, unix.ESRCH) { + return nil } + return fmt.Errorf("sending SIGKILL to container %s: %w", ctr.ID(), err) } // Give runtime a few seconds to make it happen From 4faa139b7873932ec6ef9614371eb4376f9e5425 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Thu, 19 Jan 2023 12:31:37 +0100 Subject: [PATCH 5/5] waitPidStop: reduce sleep time to 10ms Kill is a fast syscall, so we can reduce the sleep time from 100ms to 10ms in hope to speed things up a bit. [NO NEW TESTS NEEDED] Signed-off-by: Valentin Rothberg --- libpod/oci_conmon_common.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index c11787ced3..6dd54a8bc6 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -965,7 +965,7 @@ func waitPidStop(pid int, timeout time.Duration) error { } logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) } - time.Sleep(100 * time.Millisecond) + time.Sleep(10 * time.Millisecond) } } }