From fb5050d5bea74688dacc0f3e963c84913c5efacb Mon Sep 17 00:00:00 2001
From: Jonathan Lebon
Date: Thu, 14 Dec 2023 16:46:14 -0500
Subject: [PATCH] Add concept of state overlays

In the OSTree model, executables go in `/usr`, state in `/var` and
configuration in `/etc`. Software that lives in `/opt`, however, messes
this up because it often mixes code *and* state, making it harder to
manage.

More generally, it's sometimes useful to have the OSTree commit contain
code under a certain path, but still allow that path to be writable by
software and the sysadmin at runtime (`/usr/local` is another instance).

Add the concept of state overlays. A state overlay is an overlayfs mount
whose upper directory, which contains unmanaged state, is carried
forward on top of a lower directory, containing OSTree-managed files.

In the example of `/usr/local`, OSTree commits can ship content there,
all while allowing users to e.g. add scripts in `/usr/local/bin` when
booted into that commit.

Some reconciliation logic is executed whenever the base is updated so
that newer files in the base are never shadowed by a copied-up version
in the upper directory. This matches RPM semantics when upgrading
packages whose files may have been modified.

For ease of integration, this is exposed as a systemd template unit
which any downstream distro/user can enable. The instance name is the
mountpath in escaped systemd path notation (e.g.
`ostree-state-overlay@usr-local.service`).

See discussions in https://github.com/ostreedev/ostree/issues/3113 for
more details.
---
Reviewer notes (this section sits between the `---` marker and the first
diff, so `git am` ignores it):

 * The line structure of this patch was rebuilt from a flattened copy.
   Leading whitespace of context lines, the names of the system headers in
   ot-admin-builtin-state-overlay.c and the URL in the license boilerplate
   had been stripped and were reconstructed; check them against the tree
   before applying. The blob ids on the `index` lines still describe the
   original file contents.
 * state-overlay.sh: `! test -e X` never aborts a `set -e` script because
   bash exempts `!`-inverted commands from errexit; the negative assertions
   now use `test ! -e X`. The deleted-symlink check uses `-L`, since the
   symlink is dangling and `-e` is false for it either way. The sourced
   library path is quoted.
 * ot-admin-builtin-state-overlay.c: the utimensat() failure is no longer
   reported as "futimens"; is_opaque_dir() reads the xattr with a single
   call into a one-byte buffer; stale comments were fixed and every
   function got a header comment.
 * ostree-state-overlay@.service: comment typo ("setup" -> "set up").

 Makefile-boot.am                            |   2 +
 Makefile-ostree.am                          |   1 +
 src/boot/ostree-state-overlay@.service      |  36 +++
 src/ostree/ot-admin-builtin-state-overlay.c | 268 ++++++++++++++++++++
 src/ostree/ot-admin-builtins.h              |   1 +
 src/ostree/ot-builtin-admin.c               |   2 +
 tests/kolainst/destructive/state-overlay.sh | 146 ++++++++++++
 7 files changed, 456 insertions(+)
 create mode 100644 src/boot/ostree-state-overlay@.service
 create mode 100644 src/ostree/ot-admin-builtin-state-overlay.c
 create mode 100755 tests/kolainst/destructive/state-overlay.sh

diff --git a/Makefile-boot.am b/Makefile-boot.am
index 90f9804834..c07b6b8123 100644
--- a/Makefile-boot.am
+++ b/Makefile-boot.am
@@ -42,6 +42,7 @@ systemdsystemunit_DATA = src/boot/ostree-prepare-root.service \
 	src/boot/ostree-finalize-staged.service \
 	src/boot/ostree-finalize-staged.path \
 	src/boot/ostree-finalize-staged-hold.service \
+	src/boot/ostree-state-overlay@.service \
 	$(NULL)
 systemdtmpfilesdir = $(prefix)/lib/tmpfiles.d
 dist_systemdtmpfiles_DATA = src/boot/ostree-tmpfiles.conf
@@ -72,6 +73,7 @@ EXTRA_DIST += src/boot/dracut/module-setup.sh \
 	src/boot/ostree-remount.service \
 	src/boot/ostree-finalize-staged.service \
 	src/boot/ostree-finalize-staged-hold.service \
+	src/boot/ostree-state-overlay@.service \
 	src/boot/grub2/grub2-15_ostree \
 	src/boot/grub2/ostree-grub-generator \
 	$(NULL)
diff --git a/Makefile-ostree.am b/Makefile-ostree.am
index ade079c976..d2447ffe9d 100644
--- a/Makefile-ostree.am
+++ b/Makefile-ostree.am
@@ -85,6 +85,7 @@ ostree_SOURCES += \
 	src/ostree/ot-admin-builtin-post-copy.c \
 	src/ostree/ot-admin-builtin-upgrade.c \
 	src/ostree/ot-admin-builtin-unlock.c \
+	src/ostree/ot-admin-builtin-state-overlay.c \
 	src/ostree/ot-admin-builtins.h \
 	src/ostree/ot-admin-instutil-builtin-selinux-ensure-labeled.c \
 	src/ostree/ot-admin-instutil-builtin-set-kargs.c \
diff --git a/src/boot/ostree-state-overlay@.service b/src/boot/ostree-state-overlay@.service
new file mode 100644
index 0000000000..dc8aeac51b
--- /dev/null
+++ b/src/boot/ostree-state-overlay@.service
@@ -0,0 +1,36 @@
+# Copyright (C) 2023 Red Hat Inc.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <https://www.gnu.org/licenses/>.
+
+[Unit]
+Description=OSTree State Overlay On /%I
+Documentation=man:ostree(1)
+DefaultDependencies=no
+ConditionKernelCommandLine=ostree
+# run after /var is set up since that's where the upperdir is stored
+# and after boot.mount so we can load the sysroot
+After=var.mount boot.mount
+# but before local-fs.target, which we consider ourselves a part of
+Before=local-fs.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/bin/ostree admin state-overlay %i /%I
+StandardInput=null
+StandardOutput=journal
+StandardError=journal+console
+
+[Install]
+WantedBy=local-fs.target
diff --git a/src/ostree/ot-admin-builtin-state-overlay.c b/src/ostree/ot-admin-builtin-state-overlay.c
new file mode 100644
index 0000000000..65a0ce1b85
--- /dev/null
+++ b/src/ostree/ot-admin-builtin-state-overlay.c
@@ -0,0 +1,268 @@
+/* Copyright (C) 2023 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: LGPL-2.0+
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/xattr.h>
+
+#include "glnx-errors.h"
+#include "glnx-fdio.h"
+#include "glnx-local-alloc.h"
+#include "glnx-shutil.h"
+#include "glnx-xattrs.h"
+#include "ot-admin-builtins.h"
+
+/* Each overlay NAME keeps its state in OSTREE_STATEOVERLAYS_DIR/NAME/{upper,work} */
+#define OSTREE_STATEOVERLAYS_DIR "/var/ostree/state-overlays"
+#define OSTREE_STATEOVERLAY_UPPER_DIR "upper"
+#define OSTREE_STATEOVERLAY_WORK_DIR "work"
+
+/* https://www.kernel.org/doc/html/latest/filesystems/overlayfs.html */
+#define OVERLAYFS_DIR_XATTR_OPAQUE "trusted.overlay.opaque"
+
+static GOptionEntry options[] = { { NULL } };
+
+/* Create OVERLAY_DIR along with its "work" and "upper" subdirectories if they
+ * don't exist yet. On success, *OUT_OVERLAY_DFD is an open directory fd for
+ * OVERLAY_DIR which the caller owns. */
+static gboolean
+ensure_overlay_dirs (const char *overlay_dir, int *out_overlay_dfd, GCancellable *cancellable,
+                     GError **error)
+{
+  glnx_autofd int overlay_dfd = -1;
+  if (!glnx_shutil_mkdir_p_at_open (AT_FDCWD, overlay_dir, 0755, &overlay_dfd, cancellable, error))
+    return FALSE;
+
+  if (!glnx_shutil_mkdir_p_at (overlay_dfd, OSTREE_STATEOVERLAY_WORK_DIR, 0755, cancellable, error))
+    return FALSE;
+  if (!glnx_shutil_mkdir_p_at (overlay_dfd, OSTREE_STATEOVERLAY_UPPER_DIR, 0755, cancellable,
+                               error))
+    return FALSE;
+
+  *out_overlay_dfd = glnx_steal_fd (&overlay_dfd);
+  return TRUE;
+}
+
+/* Set *OUT_IS_OPAQUE to whether DNAME (relative to DFD) carries the overlayfs
+ * opaque marker, i.e. a `trusted.overlay.opaque` xattr whose value is exactly
+ * "y". A missing xattr is not an error; it simply means "not opaque". */
+static gboolean
+is_opaque_dir (int dfd, const char *dname, gboolean *out_is_opaque, GError **error)
+{
+  /* XXX: this is basically like a `glnx_lgetxattrat_allow_noent()`; upstream it */
+
+  char pathbuf[PATH_MAX];
+  snprintf (pathbuf, sizeof (pathbuf), "/proc/self/fd/%d/%s", dfd, dname);
+
+  /* The only value we care about is the single byte "y", so read into a
+   * one-byte buffer instead of sizing and allocating one: a longer value makes
+   * lgetxattr() fail with ERANGE, which by definition isn't a match. */
+  char value = '\0';
+  ssize_t value_len;
+  if (TEMP_FAILURE_RETRY (value_len = lgetxattr (pathbuf, OVERLAYFS_DIR_XATTR_OPAQUE, &value,
+                                                 sizeof (value)))
+      < 0)
+    {
+      if (errno != ENODATA && errno != ERANGE)
+        return glnx_throw_errno_prefix (error, "lgetxattr(%s)", OVERLAYFS_DIR_XATTR_OPAQUE);
+      *out_is_opaque = FALSE;
+      return TRUE;
+    }
+
+  *out_is_opaque = (value_len == 1 && value == 'y');
+  return TRUE;
+}
+
+/* Walk UPPER_DFD and delete every entry that shadows an entry of the same name
+ * in LOWER_DFD, so that content from the (updated) lowerdir is visible again.
+ * Entries which exist only in the upperdir are state and are left alone.
+ * When both sides are directories and the upper one is not opaque, we descend
+ * into them instead of deleting. */
+static gboolean
+prune_upperdir_recurse (int lower_dfd, int upper_dfd, GCancellable *cancellable, GError **error)
+{
+  g_auto (GLnxDirFdIterator) dfd_iter = { 0 };
+  if (!glnx_dirfd_iterator_init_at (upper_dfd, ".", FALSE, &dfd_iter, error))
+    return FALSE;
+
+  while (TRUE)
+    {
+      struct dirent *dent = NULL;
+      if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
+        return FALSE;
+      if (dent == NULL)
+        break;
+
+      /* do we have an entry of the same name in the lowerdir? */
+      struct stat stbuf;
+      if (!glnx_fstatat_allow_noent (lower_dfd, dent->d_name, &stbuf, AT_SYMLINK_NOFOLLOW, error))
+        return FALSE;
+      if (errno == ENOENT)
+        continue; /* state file (i.e. upperdir only); carry on */
+
+      /* ok, it shadows; are they both directories? */
+      if (dent->d_type == DT_DIR && S_ISDIR (stbuf.st_mode))
+        {
+          /* is the directory opaque? */
+          gboolean is_opaque = FALSE;
+          if (!is_opaque_dir (upper_dfd, dent->d_name, &is_opaque, error))
+            return FALSE;
+
+          if (!is_opaque)
+            {
+              /* recurse */
+              glnx_autofd int lower_subdfd = -1;
+              if (!glnx_opendirat (lower_dfd, dent->d_name, FALSE, &lower_subdfd, error))
+                return FALSE;
+              glnx_autofd int upper_subdfd = -1;
+              if (!glnx_opendirat (upper_dfd, dent->d_name, FALSE, &upper_subdfd, error))
+                return FALSE;
+              if (!prune_upperdir_recurse (lower_subdfd, upper_subdfd, cancellable, error))
+                return glnx_prefix_error (error, "in %s", dent->d_name);
+
+              continue;
+            }
+
+          /* fallthrough; implicitly delete opaque directories */
+        }
+
+      /* any other case, we prune (this also implicitly covers whiteouts and opaque dirs) */
+      if (dent->d_type == DT_DIR)
+        {
+          if (!glnx_shutil_rm_rf_at (upper_dfd, dent->d_name, cancellable, error))
+            return FALSE;
+        }
+      /* just unlinkat(); saves one openat() call */
+      else if (!glnx_unlinkat (upper_dfd, dent->d_name, 0, error))
+        return FALSE;
+    }
+
+  return TRUE;
+}
+
+/* Reconcile the upperdir under OVERLAY_DFD against the lowerdir opened at
+ * MOUNTPATH (the caller invokes this before mounting the overlay there), then
+ * bump the upperdir's timestamps so the caller's mtime check sees the prune
+ * as done. SYSROOT_FD and DEPLOYMENT_PATH are currently unused. */
+static gboolean
+prune_upperdir (int sysroot_fd, const char *deployment_path, const char *mountpath, int overlay_dfd,
+                GCancellable *cancellable, GError **error)
+{
+  glnx_autofd int lower_dfd = -1;
+  if (!glnx_opendirat (AT_FDCWD, mountpath, FALSE, &lower_dfd, error))
+    return FALSE;
+
+  glnx_autofd int upper_dfd = -1;
+  if (!glnx_opendirat (overlay_dfd, OSTREE_STATEOVERLAY_UPPER_DIR, FALSE, &upper_dfd, error))
+    return FALSE;
+
+  if (!prune_upperdir_recurse (lower_dfd, upper_dfd, cancellable, error))
+    return FALSE;
+
+  /* touch upperdir to mark prune as completed */
+  if (utimensat (overlay_dfd, OSTREE_STATEOVERLAY_UPPER_DIR, NULL, 0) < 0)
+    return glnx_throw_errno_prefix (error, "utimensat(upper)");
+
+  return TRUE;
+}
+
+/* Mount an overlayfs on MOUNTPATH, using MOUNTPATH itself as the lowerdir and
+ * the upper/work directories of overlay NAME. */
+static gboolean
+mount_overlay (const char *mountpath, const char *name, GError **error)
+{
+  /* we could use /proc/self/... with overlay_dfd to avoid these allocations,
+   * but this gets stringified into the options field in the mount table, and
+   * being cryptic is not helpful */
+  g_autofree char *upperdir
+      = g_build_filename (OSTREE_STATEOVERLAYS_DIR, name, OSTREE_STATEOVERLAY_UPPER_DIR, NULL);
+  g_autofree char *workdir
+      = g_build_filename (OSTREE_STATEOVERLAYS_DIR, name, OSTREE_STATEOVERLAY_WORK_DIR, NULL);
+  g_autofree char *ovl_options
+      = g_strdup_printf ("lowerdir=%s,upperdir=%s,workdir=%s", mountpath, upperdir, workdir);
+  if (mount ("overlay", mountpath, "overlay", MS_SILENT, ovl_options) < 0)
+    return glnx_throw_errno_prefix (error, "mount(%s)", mountpath);
+
+  return TRUE;
+}
+
+/* Called by ostree-state-overlay@.service.
+ *
+ * Usage: ostree admin state-overlay NAME MOUNTPATH
+ *
+ * Ensures the state directories for NAME exist, prunes the upperdir when its
+ * mtime is older than that of the booted deployment directory, and finally
+ * mounts the overlay on MOUNTPATH. */
+gboolean
+ot_admin_builtin_state_overlay (int argc, char **argv, OstreeCommandInvocation *invocation,
+                                GCancellable *cancellable, GError **error)
+{
+  g_autoptr (GOptionContext) context = g_option_context_new ("NAME MOUNTPATH");
+  g_autoptr (OstreeSysroot) sysroot = NULL;
+
+  /* Parse the arguments; this also populates `sysroot`, which we rely on
+   * below to find the booted deployment. */
+  if (!ostree_admin_option_context_parse (context, options, &argc, &argv,
+                                          OSTREE_ADMIN_BUILTIN_FLAG_NONE, invocation, &sysroot,
+                                          cancellable, error))
+    return FALSE;
+
+  if (argc < 3)
+    return glnx_throw (error, "Missing NAME or MOUNTPATH");
+
+  /* Sanity-check */
+  OstreeDeployment *booted_deployment = ostree_sysroot_get_booted_deployment (sysroot);
+  if (booted_deployment == NULL)
+    return glnx_throw (error, "Must be booted into an OSTree deployment");
+
+  const char *overlay_name = argv[1];
+  const char *mountpath = argv[2];
+
+  glnx_autofd int overlay_dfd = -1;
+  g_autofree char *overlay_dir = g_build_filename (OSTREE_STATEOVERLAYS_DIR, overlay_name, NULL);
+  if (!ensure_overlay_dirs (overlay_dir, &overlay_dfd, cancellable, error))
+    return FALSE;
+
+  struct stat stbuf_upper;
+  if (!glnx_fstatat (overlay_dfd, OSTREE_STATEOVERLAY_UPPER_DIR, &stbuf_upper, 0, error))
+    return FALSE;
+
+  /* We don't use "/" directly here because that may have e.g. an overlay
+   * slapped on from root.transient or composefs. */
+  g_autofree char *deployment_path
+      = ostree_sysroot_get_deployment_dirpath (sysroot, booted_deployment);
+  struct stat stbuf_lower;
+  if (!glnx_fstatat (ostree_sysroot_get_fd (sysroot), deployment_path, &stbuf_lower, 0, error))
+    return FALSE;
+
+  if (stbuf_upper.st_mtime < stbuf_lower.st_mtime)
+    {
+      /* the lowerdir was updated; prune the upperdir */
+      if (!prune_upperdir (ostree_sysroot_get_fd (sysroot), deployment_path, mountpath, overlay_dfd,
+                           cancellable, error))
+        return glnx_prefix_error (error, "Pruning upperdir for %s", overlay_name);
+    }
+
+  return mount_overlay (mountpath, overlay_name, error);
+}
diff --git a/src/ostree/ot-admin-builtins.h b/src/ostree/ot-admin-builtins.h
index 1775384e17..cd1472bf3d 100644
--- a/src/ostree/ot-admin-builtins.h
+++ b/src/ostree/ot-admin-builtins.h
@@ -50,6 +50,7 @@ BUILTINPROTO (upgrade);
 BUILTINPROTO (kargs);
 BUILTINPROTO (post_copy);
 BUILTINPROTO (lock_finalization);
+BUILTINPROTO (state_overlay);
 
 #undef BUILTINPROTO
 
diff --git a/src/ostree/ot-builtin-admin.c b/src/ostree/ot-builtin-admin.c
index 35a1e115c1..68a54751f0 100644
--- a/src/ostree/ot-builtin-admin.c
+++ b/src/ostree/ot-builtin-admin.c
@@ -42,6 +42,8 @@ static OstreeCommand admin_subcommands[] = {
     "Change the finalization locking state of the staged deployment" },
   { "boot-complete", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN,
     ot_admin_builtin_boot_complete, "Internal command to run at boot after an update was applied" },
+  { "state-overlay", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN,
+    ot_admin_builtin_state_overlay, "Internal command to assemble a state overlay" },
   { "init-fs", OSTREE_BUILTIN_FLAG_NO_REPO, ot_admin_builtin_init_fs,
     "Initialize a root filesystem" },
   { "instutil", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN, ot_admin_builtin_instutil,
diff --git a/tests/kolainst/destructive/state-overlay.sh b/tests/kolainst/destructive/state-overlay.sh
new file mode 100755
index 0000000000..4442611e9f
--- /dev/null
+++ b/tests/kolainst/destructive/state-overlay.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+set -xeuo pipefail
+
+. "${KOLA_EXT_DATA}/libinsttest.sh"
+
+case "${AUTOPKGTEST_REBOOT_MARK:-}" in
+  "")
+    # create a new ostree commit with some toplevel content
+    mkdir -p /var/tmp/rootfs/foobar
+    (cd /var/tmp/rootfs/foobar
+     touch an_empty_file
+     echo 'foobar' > a_non_empty_file
+     echo 'foobar' > another_file
+     ln -s an_empty_file a_working_symlink
+     ln -s enoent a_broken_symlink
+     mkdir an_empty_subdir
+     mkdir a_nonempty_subdir
+     echo foobar > a_nonempty_subdir/foobar
+     mkdir -p a_deeply/deeply/nested/subdir
+     echo foobar > a_deeply/deeply/nested/subdir/foobar
+
+     # test content deletion
+     mkdir a_dir_to_delete
+     touch a_file_to_delete
+     ln -s enoent a_symlink_to_delete
+
+     # opaque directory
+     mkdir a_dir_to_make_opaque
+     touch a_dir_to_make_opaque/base
+    )
+
+    ostree commit --no-bindings -P -b foobar --tree=ref="${host_commit}" --tree=dir=/var/tmp/rootfs
+    rpm-ostree rebase :foobar
+    systemctl enable ostree-state-overlay@foobar.service
+    /tmp/autopkgtest-reboot "2"
+    ;;
+  "2")
+    if ! test -d /foobar; then
+      fatal "no /foobar toplevel dir"
+    fi
+    if [[ $(findmnt /foobar -no SOURCE) != overlay ]]; then
+      fatal "/foobar is not overlay"
+    fi
+
+    cd /foobar
+
+    # create some state files (i.e. not shadowing)
+    echo "state" > state
+    echo "state" > a_nonempty_subdir/state
+    echo "state" > a_deeply/deeply/nested/subdir/state
+    ln -s foobar state_symlink
+    mkdir state_dir
+
+    # and shadow some base files
+
+    # make empty file non-empty
+    echo shadow > an_empty_file
+    # make a file become a directory
+    rm a_non_empty_file && mkdir a_non_empty_file
+    # make a file become a symlink
+    ln -sf some_target another_file
+    # override a working symlink
+    ln -sf another_file a_working_symlink
+    # override a non-working symlink
+    ln -sf enoent2 a_broken_symlink
+    # make dir become a file
+    rmdir an_empty_subdir
+    touch an_empty_subdir
+    # override file in a shallow subdir
+    echo shadow > a_nonempty_subdir/foobar
+    # override file in a deep subdir
+    echo shadow > a_deeply/deeply/nested/subdir/foobar
+    # delete some base files
+    rmdir a_dir_to_delete
+    rm a_file_to_delete
+    rm a_symlink_to_delete
+    # opaque directory
+    rm -rf a_dir_to_make_opaque
+    mkdir a_dir_to_make_opaque
+    touch a_dir_to_make_opaque/state
+
+    # check that rebooting without upgrading maintains state
+    /tmp/autopkgtest-reboot "3"
+    ;;
+  "3")
+    cd /foobar
+
+    # check state is still there
+    assert_file_has_content state state
+    assert_file_has_content a_nonempty_subdir/state state
+    assert_file_has_content a_deeply/deeply/nested/subdir/state state
+    [[ $(readlink state_symlink) == foobar ]]
+    test -d state_dir
+
+    # check shadowings
+    assert_file_has_content an_empty_file shadow
+    test -d a_non_empty_file
+    [[ $(readlink another_file) == some_target ]]
+    [[ $(readlink a_working_symlink) == another_file ]]
+    [[ $(readlink a_broken_symlink) == enoent2 ]]
+    test -f an_empty_subdir
+    assert_file_has_content a_nonempty_subdir/foobar shadow
+    assert_file_has_content a_deeply/deeply/nested/subdir/foobar shadow
+    test ! -e a_dir_to_delete
+    test ! -e a_file_to_delete
+    test ! -L a_symlink_to_delete
+    # opaque directory
+    test -d a_dir_to_make_opaque
+    test ! -e a_dir_to_make_opaque/base
+    test -e a_dir_to_make_opaque/state
+
+    # now reboot into an upgrade
+    ostree commit --no-bindings -P -b foobar --tree=ref="${host_commit}"
+    rpm-ostree upgrade
+    /tmp/autopkgtest-reboot "4"
+    ;;
+  "4")
+    cd /foobar
+
+    # check state is still there
+    assert_file_has_content state state
+    assert_file_has_content a_nonempty_subdir/state state
+    assert_file_has_content a_deeply/deeply/nested/subdir/state state
+    [[ $(readlink state_symlink) == foobar ]]
+    test -d state_dir
+
+    # check shadowings are gone
+    test -f an_empty_file
+    assert_file_has_content a_non_empty_file foobar
+    assert_file_has_content another_file foobar
+    [[ $(readlink a_working_symlink) == an_empty_file ]]
+    [[ $(readlink a_broken_symlink) == enoent ]]
+    test -d an_empty_subdir
+    test -d a_nonempty_subdir
+    assert_file_has_content a_nonempty_subdir/foobar foobar
+    assert_file_has_content a_deeply/deeply/nested/subdir/foobar foobar
+    test -d a_dir_to_delete
+    test -f a_file_to_delete
+    test -L a_symlink_to_delete
+    # opaque directory
+    test -d a_dir_to_make_opaque
+    test -e a_dir_to_make_opaque/base
+    test ! -e a_dir_to_make_opaque/state
+    ;;
+  *) fatal "Unexpected AUTOPKGTEST_REBOOT_MARK=${AUTOPKGTEST_REBOOT_MARK}" ;;
+esac