From 86360598bd9b0ab1485e2f8e3a0bc1eaa9748afd Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 24 Jan 2024 17:20:41 -0800 Subject: [PATCH] tests/int: fix flaky kill tests It takes some time for the kernel to kill the process (and remove its PID from cgroup.procs). To ensure we don't have flakes from reading cgroup.procs right after the kill, check and wait for processes to actually be gone. Fixes: 4163 Reported-by: lifubang@acmcoder.com Signed-off-by: Kir Kolyshkin --- tests/integration/delete.bats | 20 +++++++++++++------- tests/integration/helpers.bash | 30 ++++++++++++++++++++++++++++++ tests/integration/kill.bats | 27 ++++++++++++++++++--------- 3 files changed, 61 insertions(+), 16 deletions(-) diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats index a7ade1082ed..bdb94516b9e 100644 --- a/tests/integration/delete.bats +++ b/tests/integration/delete.bats @@ -54,12 +54,13 @@ function test_runc_delete_host_pidns() { # not have own PID ns, its init is no special and the container # will still be up and running. kill -9 "$init_pid" + wait_pids_gone 10 0.2 "$init_pid" # Get the list of all container processes. - pids=$(cat "$cgpath"/cgroup.procs) - echo "pids: $pids" + mapfile -t pids < <(cat "$cgpath"/cgroup.procs) + echo "pids:" "${pids[@]}" # Sanity check -- make sure all processes exist. - for p in $pids; do + for p in "${pids[@]}"; do kill -0 "$p" done @@ -70,10 +71,15 @@ function test_runc_delete_host_pidns() { runc state test_busybox [ "$status" -ne 0 ] # "Container does not exist" - # Make sure all processes are gone. - pids=$(cat "$cgpath"/cgroup.procs) || true # OK if cgroup is gone - echo "pids: $pids" - [ -z "$pids" ] + # Wait and check that all the processes are gone. + wait_pids_gone 10 0.2 "${pids[@]}" + + # Make sure cgroup.procs is empty. 
# Wait until none of the listed processes exist any more. Use after
# kill/stop etc.
#
# Usage: wait_pids_gone ITERATIONS SLEEP PID [PID ...]
#   ITERATIONS  maximum number of times the PID list is checked
#   SLEEP       delay between two checks, passed verbatim to sleep(1)
#   PID         one or more process IDs to wait for
#
# Returns 0 as soon as every PID is gone. Returns 1 when called with fewer
# than three arguments (usage is printed to stderr), or when some PIDs are
# still present after the last check (the survivors are listed on stderr).
#
# Note: a PID counts as "gone" when `kill -0` fails for it, which also
# happens when the caller is not permitted to signal that process.
function wait_pids_gone() {
	if [ $# -lt 3 ]; then
		echo "Usage: wait_pids_gone ITERATIONS SLEEP PID [PID ...]" 1>&2
		return 1
	fi
	local iter=$1 delay=$2
	shift 2
	local pids=("$@")
	# Keep the loop index local so the caller's own $i is never clobbered.
	local i

	while true; do
		for i in "${!pids[@]}"; do
			# Probe the process with signal 0; drop it from the list once
			# the probe fails.
			kill -0 "${pids[i]}" 2>/dev/null || unset "pids[i]"
		done
		if [ ${#pids[@]} -eq 0 ]; then
			return 0
		fi
		# Re-pack the survivors so the array has no index gaps.
		pids=("${pids[@]}")

		# No sleep after the final check: give up right away.
		((--iter > 0)) || break

		sleep "$delay"
	done

	echo "Expected all PIDs to be gone, but some are still there:" "${pids[@]}" 1>&2
	return 1
}
- for p in $pids; do + for p in "${pids[@]}"; do kill -0 "$p" done runc kill test_busybox KILL [ "$status" -eq 0 ] - wait_for_container 10 1 test_busybox stopped - - # Make sure all processes are gone. - pids=$(cat "$cgpath"/cgroup.procs) || true # OK if cgroup is gone - echo "pids: $pids" - [ -z "$pids" ] + # Wait and check that all processes are gone. + wait_pids_gone 10 0.2 "${pids[@]}" + + # Make sure the container is in stopped state. Note if KILL_INIT + # is set, container was already stopped by killing its $init_pid + # and so this check is NOP/redundant. + testcontainer test_busybox stopped + + # Make sure cgroup.procs is empty. + mapfile -t pids < <(cat "$cgpath"/cgroup.procs || true) + if [ ${#pids[@]} -gt 0 ]; then + echo "expected empty cgroup.procs, got:" "${pids[@]}" 1>&2 + return 1 + fi } @test "kill detached busybox" {