From 79eaadd3fb480255fa0b9fbfc5517425b7c1730b Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 15 Dec 2020 14:40:16 -0700 Subject: [PATCH] podman upgrade tests Initial validation of using podman-in-podman to create an old-podman root, then use new-podman to play with the containers created therein. Signed-off-by: Ed Santiago --- .cirrus.yml | 33 +++ contrib/cirrus/runner.sh | 4 + contrib/cirrus/setup_environment.sh | 1 + test/system/helpers.bash | 2 +- test/upgrade/README.md | 87 ++++++++ test/upgrade/helpers.bash | 11 + test/upgrade/test-upgrade.bats | 313 ++++++++++++++++++++++++++++ 7 files changed, 450 insertions(+), 1 deletion(-) create mode 100644 test/upgrade/README.md create mode 100644 test/upgrade/helpers.bash create mode 100644 test/upgrade/test-upgrade.bats diff --git a/.cirrus.yml b/.cirrus.yml index a23595712f..ae8a4dc3ae 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -598,6 +598,38 @@ rootless_system_test_task: main_script: *main always: *logs_artifacts +# FIXME: we may want to consider running this from nightly cron instead of CI. +# The tests are actually pretty quick (less than a minute) but they do rely +# on pulling images from quay.io, which means we're subject to network flakes. +# +# FIXME: how does this env matrix work, anyway? Does it spin up multiple VMs? +# We might just want to encode the version matrix in runner.sh instead +upgrade_test_task: + name: "Upgrade test: from $PODMAN_UPGRADE_FROM" + alias: upgrade_test + skip: *tags + only_if: *not_docs + depends_on: + - local_system_test + matrix: + - env: + PODMAN_UPGRADE_FROM: v1.9.0 + - env: + PODMAN_UPGRADE_FROM: v2.0.6 + - env: + PODMAN_UPGRADE_FROM: v2.1.1 + gce_instance: *standardvm + env: + TEST_FLAVOR: upgrade_test + DISTRO_NV: ${FEDORA_NAME} + VM_IMAGE_NAME: ${FEDORA_CACHE_IMAGE_NAME} + # ID for re-use of build output + _BUILD_CACHE_HANDLE: ${FEDORA_NAME}-build-${CIRRUS_BUILD_ID} + clone_script: *noop + gopath_cache: *ro_gopath_cache + setup_script: *setup + main_script: *main + always: *logs_artifacts # This task is critical. It updates the "last-used by" timestamp stored # in metadata for all VM images. This mechanism functions in tandem with @@ -654,6 +686,7 @@ success_task: - local_system_test - remote_system_test - rootless_system_test + - upgrade_test - meta container: *smallcontainer env: diff --git a/contrib/cirrus/runner.sh b/contrib/cirrus/runner.sh index ccbdb63b6e..fca9aff931 100755 --- a/contrib/cirrus/runner.sh +++ b/contrib/cirrus/runner.sh @@ -70,6 +70,10 @@ function _run_sys() { dotest system } +function _run_upgrade_test() { + bats test/upgrade |& logformatter +} + function _run_bindings() { # shellcheck disable=SC2155 export PATH=$PATH:$GOSRC/hack diff --git a/contrib/cirrus/setup_environment.sh b/contrib/cirrus/setup_environment.sh index 4c95d02542..64ea3b7b4e 100755 --- a/contrib/cirrus/setup_environment.sh +++ b/contrib/cirrus/setup_environment.sh @@ -200,6 +200,7 @@ case "$TEST_FLAVOR" in compose) ;& int) ;& sys) ;& + upgrade_test) ;& bindings) ;& endpoint) # Use existing host bits when testing is to happen inside a container diff --git a/test/system/helpers.bash b/test/system/helpers.bash index 0572c6866e..4a7823852c 100644 --- a/test/system/helpers.bash +++ b/test/system/helpers.bash @@ -149,7 +149,7 @@ function run_podman() { echo "$_LOG_PROMPT $PODMAN $*" # BATS hangs if a subprocess remains and keeps FD 3 open; this happens # if podman crashes unexpectedly without cleaning up subprocesses. - run timeout --foreground -v --kill=10 $PODMAN_TIMEOUT $PODMAN "$@" 3>/dev/null + run timeout --foreground -v --kill=10 $PODMAN_TIMEOUT $PODMAN $_PODMAN_TEST_OPTS "$@" 3>/dev/null # without "quotes", multiple lines are glommed together into one if [ -n "$output" ]; then echo "$output" diff --git a/test/upgrade/README.md b/test/upgrade/README.md new file mode 100644 index 0000000000..2979a66d74 --- /dev/null +++ b/test/upgrade/README.md @@ -0,0 +1,87 @@ +Background +========== + +For years we've been needing a way to test podman upgrades; this +became much more critical on December 7, 2020, when Matt disclosed +a bug he had found over the weekend +([#8613](https://github.com/containers/podman/issues/8613)) +in which reuse of a previously-defined field name would +result in fatal JSON decode failures if current-podman were +to try reading containers created with podman <= 1.8 (FIXME: confirm) + +Upgrade testing is a daunting problem; but in the December 12 +Cabal meeting Dan suggested using podman-in-podman. This PR +is the result of fleshing out that idea. + +Overview +======== + +The BATS script in this directory fetches and runs an old-podman +container image from quay.io/podman, uses it to create and run +a number of containers, then uses new-podman to interact with +those containers. + +As of 2021-02-23 the available old-podman versions are: + +```console +$ ./bin/podman search --list-tags quay.io/podman/stable | awk '$2 ~ /^v/ { print $2}' | sort | column -c 75 +v1.4.2 v1.5.0 v1.6 v1.9.0 v2.0.2 v2.1.1 +v1.4.4 v1.5.1 v1.6.2 v1.9.1 v2.0.6 v2.2.1 +``` + +Test invocation is: +```console +$ sudo env PODMAN=bin/podman PODMAN_UPGRADE_FROM=v1.9.0 PODMAN_UPGRADE_TEST_DEBUG= bats test/upgrade +``` +(Path assumes you're cd'ed to top-level podman repo). `PODMAN_UPGRADE_FROM` +can be any of the versions above. `PODMAN_UPGRADE_TEST_DEBUG` is empty +here, but listed so you can set it `=1` and leave the podman_parent +container running. Interacting with this container is left as an +exercise for the reader. + +The script will pull the given podman image, invoke it with a scratch +root directory, and have it do a small set of podman stuff (pull an +image, create/run some containers). This podman process stays running +because if it exits, it kills containers running inside the container. + +We then invoke the current (host-installed) podman, using the same +scratch root directory, and perform operations on those images and +containers. Most of those operations are done in individual @tests. + +The goal is to have this upgrade test run in CI, iterating over a +loop of known old versions. This list would need to be hand-maintained +and updated on new releases. There might also need to be extra +configuration defined, such as per-version commands (see below). + +Findings +======== + +Well, first, `v1.6.2` won't work on default f32/f33: the image +does not include `crun`, so it can't work at all: + + ERRO[0000] oci runtime "runc" does not support CGroups V2: use system migrate to mitigate + +I realize that it's kind of stupid not to test 1.6, since that's +precisely the test that would've caught #8613 early, but I just +don't think it's worth the hassle of setting up cgroupsv1 VMs. + +For posterity, in an earlier incantation of this script I tried +booting f32 into cgroupsv1 and ran into the following warnings +when running new-podman on old-containers: +``` +ERRO[0000] error joining network namespace for container 322b66d94640e31b2e6921565445cf0dade4ec13cabc16ee5f29292bdc038341: error retrieving network namespace at /var/run/netns/cni-577e2289-2c05-2e28-3c3d-002a5596e7da: failed to Statfs "/var/run/netns/cni-577e2289 +``` + +Where To Go From Here +===================== + +* Tests are still (2021-02-23) incomplete, with several failing outright. + See FIXMEs in the code. + +* Figuring out how/if to run rootless. I think this is possible, perhaps + even necessary, but will be tricky to get right because of home-directory + mounting. + +* Figuring out how/if to run variations with different config files + (e.g. running OLD-PODMAN that creates a user libpod.conf, tweaking + that in the test, then running NEW-PODMAN upgrate tests) diff --git a/test/upgrade/helpers.bash b/test/upgrade/helpers.bash new file mode 100644 index 0000000000..41d9279e6d --- /dev/null +++ b/test/upgrade/helpers.bash @@ -0,0 +1,11 @@ +# -*- bash -*- + +load "../system/helpers" + +setup() { + : +} + +teardown() { + : +} diff --git a/test/upgrade/test-upgrade.bats b/test/upgrade/test-upgrade.bats new file mode 100644 index 0000000000..dd827b3987 --- /dev/null +++ b/test/upgrade/test-upgrade.bats @@ -0,0 +1,313 @@ +# -*- bats -*- + +load helpers + +# Create a var-lib-containers dir for this podman. We need to bind-mount +# this into the container, and use --root and --runroot and --tmpdir +# options both in the container podman and out here: that's the only +# way to share image and container storage. +if [ -z "${PODMAN_UPGRADE_WORKDIR}" ]; then + # Much as I'd love a descriptive name like "podman-upgrade-tests.XXXXX", + # keep it short ("pu") because of the 100-character path length limit + # for UNIX sockets (needed by conmon) + export PODMAN_UPGRADE_WORKDIR=$(mktemp -d --tmpdir=${BATS_TMPDIR:-${TMPDIR:-/tmp}} pu.XXXXXX) + + touch $PODMAN_UPGRADE_WORKDIR/status +fi + +# Generate a set of random strings used for content verification +if [ -z "${RANDOM_STRING_1}" ]; then + export RANDOM_STRING_1=$(random_string 15) + export LABEL_CREATED=$(random_string 16) + export LABEL_FAILED=$(random_string 17) + export LABEL_RUNNING=$(random_string 18) + + # FIXME: randomize this + HOST_PORT=34567 +fi + +# Version string of the podman we're actually testing, e.g. '3.0.0-dev-d1a26013' +PODMAN_VERSION=$($PODMAN version |awk '/^Version:/ { V=$2 } /^Git Commit:/ { G=$3 } END { print V "-" substr(G,0,8) }') + +setup() { + skip_if_rootless + + # The podman-in-podman image (old podman) + if [[ -z "$PODMAN_UPGRADE_FROM" ]]; then + echo "# \$PODMAN_UPGRADE_FROM is undefined (should be e.g. v1.9.0)" >&3 + false + fi + + if [ "$(< $PODMAN_UPGRADE_WORKDIR/status)" = "failed" ]; then + # FIXME: exit instead? + echo "*** setup failed - no point in running tests" + false + fi + + export _PODMAN_TEST_OPTS="--root=$PODMAN_UPGRADE_WORKDIR/root --runroot=$PODMAN_UPGRADE_WORKDIR/runroot --tmpdir=$PODMAN_UPGRADE_WORKDIR/tmp" +} + +############################################################################### +# BEGIN setup + +@test "initial setup: start $PODMAN_UPGRADE_FROM containers" { + echo failed >| $PODMAN_UPGRADE_WORKDIR/status + + OLD_PODMAN=quay.io/podman/stable:$PODMAN_UPGRADE_FROM + $PODMAN pull $OLD_PODMAN + + # Shortcut name, because we're referencing it a lot + pmroot=$PODMAN_UPGRADE_WORKDIR + + # WWW content to share + mkdir -p $pmroot/var/www + echo $RANDOM_STRING_1 >$pmroot/var/www/index.txt + + # podman tmpdir + mkdir -p $pmroot/tmp + + # + # Script to run >>OLD<< podman commands. + # + # These commands will be run inside a podman container. The "podman" + # command in this script will be the desired old-podman version. + # + pmscript=$pmroot/setup + cat >| $pmscript <| $PODMAN_UPGRADE_WORKDIR/status +} + +# END setup +############################################################################### +# BEGIN actual tests + +# This is a NOP; used only so the version string will show up in logs +@test "upgrade: $PODMAN_UPGRADE_FROM -> $PODMAN_VERSION" { + : +} + +@test "images" { + run_podman images -a --format '{{.Names}}' + is "$output" "\[$IMAGE\]" "podman images" +} + +@test "ps : one container running" { + run_podman ps --format '{{.Image}}--{{.Names}}' + is "$output" "$IMAGE--myrunningcontainer" "ps: one container running" +} + +@test "ps -a : shows all containers" { + # IMPORTANT: we can't use --sort=created, because that requires #8427 + # on the *creating* podman end. + run_podman ps -a \ + --format '{{.Names}}--{{.Status}}--{{.Ports}}--{{.Labels.mylabel}}' \ + --sort=names + is "${lines[0]}" "mycreatedcontainer--Created----$LABEL_CREATED" "created" + is "${lines[1]}" "mydonecontainer--Exited (0).*----" "done" + is "${lines[2]}" "myfailedcontainer--Exited (17) .*----$LABEL_FAILED" "fail" + is "${lines[3]}" "myrunningcontainer--Up .*----$LABEL_RUNNING" "running" + + # For debugging: dump containers and IDs + if [[ -n "$PODMAN_UPGRADE_TEST_DEBUG" ]]; then + run_podman ps -a + for l in "${lines[@]}"; do + echo "# $l" >&3 + done + fi +} + + +@test "inspect - all container status" { + tests=" +running | running | 0 +created | configured | 0 +done | exited | 0 +failed | exited | 17 +" + while read cname state exitstatus; do + run_podman inspect --format '{{.State.Status}}--{{.State.ExitCode}}' my${cname}container + is "$output" "$state--$exitstatus" "status of my${cname}container" + done < <(parse_table "$tests") +} + +@test "logs" { + run_podman logs mydonecontainer + is "$output" "++$RANDOM_STRING_1++" "podman logs on stopped container" + +# run_podman logs myrunningcontainer +# is "$output" "READY" "podman logs on running container" +} + +@test "exec" { + run_podman exec myrunningcontainer cat /var/www/index.txt + is "$output" "$RANDOM_STRING_1" "exec into myrunningcontainer" +} + +@test "load" { + # FIXME, is this really necessary? + skip "TBI. Not sure if there's any point to this." +} + +@test "mount" { + skip "TBI" +} + +@test "pods" { + skip "TBI" +} + +# FIXME: commit? kill? network? pause? restart? top? volumes? What else? + + +@test "start" { + skip "FIXME: this leaves a mount behind: root/overlay/sha/merged" + run_podman --cgroup-manager=cgroupfs start -a mydonecontainer + is "$output" "++$RANDOM_STRING_1++" "start on already-run container" +} + +@test "rm a stopped container" { + # FIXME FIXME FIXME! + # + # I have no idea what's going on here. For most of my testing in this + # section, the code here was simply 'podman rm myfailedcontainer', and + # it would succeed, but then way down, in 'cleanup' below, the 'rm -f' + # step would fail: + # + # # podman rm -f podman_parent + # error freeing lock for container : no such file or directory + # ...where is the ID of the podman_parent container. + # + # I started playing with this section, by adding 'rm mydonecontainer', + # and now it always fails, the same way, but with the container we're + # removing right here: + # + # error freeing lock for container : no such file or directory + # ...where is the ID of mydonecontainer. + # + # I don't know. I give up for now, and am skip'ing the whole thing. + # If you want to play with it, try commenting out the 'myfailed' lines, + # or just the 'mydone' ones, or, I don't know. + skip "FIXME: error freeing lock for container : no such file or dir" + + # For debugging, so we can see what 'error freeing lock' refers to + run_podman ps -a + + run_podman rm myfailedcontainer + is "$output" "[0-9a-f]\\{64\\}" "podman rm myfailedcontainer" + + run_podman rm mydonecontainer + is "$output" "[0-9a-f]\\{64\\}" "podman rm mydonecontainer" +} + + +@test "stop and rm" { + # About a ten-second pause, then: + # Error: timed out waiting for file /tmp/pu.nf747w/tmp/exits/: internal libpod error + # It doesn't seem to be a socket-length issue: the paths are ~80-88 chars. + # Leaving podman_parent running, and exec'ing into it, it doesn't look + # like the file is being written to the wrong place. + skip "FIXME: this doesn't work: timed out waiting for file tmpdir/exits/sha" + run_podman stop myrunningcontainer + run_podman rm myrunningcontainer +} + +@test "clean up parent" { + if [[ -n "$PODMAN_UPGRADE_TEST_DEBUG" ]]; then + skip "workdir is $PODMAN_UPGRADE_WORKDIR" + fi + + # We're done with shared environment. By clearing this, we can now + # use run_podman for actions on the podman_parent container + unset _PODMAN_TEST_OPTS + + # (Useful for debugging the 'rm -f' step below, which, when it fails, only + # gives a container ID. This 'ps' confirms that the CID is podman_parent) + run_podman ps -a + + # Stop the container gracefully + run_podman exec podman_parent touch /stop + run_podman wait podman_parent + + run_podman logs podman_parent + run_podman rm -f podman_parent + + # FIXME: why does this remain mounted? + umount $PODMAN_UPGRADE_WORKDIR/root/overlay || true + + rm -rf $PODMAN_UPGRADE_WORKDIR +} + +# FIXME: now clean up