From 2d1f4a8bffbf81f5b047ea1609f324aaf9763586 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Fri, 10 Mar 2023 21:24:21 +0100 Subject: [PATCH] cgroupns: private cgroupns on cgroupv1 breaks --systemd On cgroup v1 we need to mount only the systemd named hierarchy as writeable, so we configure the OCI runtime to mount /sys/fs/cgroup as read-only and on top of that bind mount /sys/fs/cgroup/systemd. But when we use a private cgroupns, we cannot do that since we don't know the final cgroup path. Also, do not override the mount if there is already one for /sys/fs/cgroup/systemd. Closes: https://github.com/containers/podman/issues/17727 Signed-off-by: Giuseppe Scrivano --- libpod/container_internal_linux.go | 84 ++++++++++++++++-------------- test/system/250-systemd.bats | 7 +++ test/system/helpers.bash | 9 ++++ 3 files changed, 61 insertions(+), 39 deletions(-) diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index 7988acdcd2..3a405225fc 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -242,17 +242,17 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro return err } + hasCgroupNs := false + for _, ns := range c.config.Spec.Linux.Namespaces { + if ns.Type == spec.CgroupNamespace { + hasCgroupNs = true + break + } + } + if unified { g.RemoveMount("/sys/fs/cgroup") - hasCgroupNs := false - for _, ns := range c.config.Spec.Linux.Namespaces { - if ns.Type == spec.CgroupNamespace { - hasCgroupNs = true - break - } - } - var systemdMnt spec.Mount if hasCgroupNs { systemdMnt = spec.Mount{ @@ -271,40 +271,46 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro } g.AddMount(systemdMnt) } else { + hasSystemdMount := MountExists(mounts, "/sys/fs/cgroup/systemd") + if hasCgroupNs && !hasSystemdMount { + return errors.New("cgroup namespace is not supported with cgroup v1 and systemd mode") + } mountOptions := []string{"bind", "rprivate"} - skipMount := false - - var statfs unix.Statfs_t - if err := unix.Statfs("/sys/fs/cgroup/systemd", &statfs); err != nil { - if errors.Is(err, os.ErrNotExist) { - // If the mount is missing on the host, we cannot bind mount it so - // just skip it. - skipMount = true - } - mountOptions = append(mountOptions, "nodev", "noexec", "nosuid") - } else { - if statfs.Flags&unix.MS_NODEV == unix.MS_NODEV { - mountOptions = append(mountOptions, "nodev") - } - if statfs.Flags&unix.MS_NOEXEC == unix.MS_NOEXEC { - mountOptions = append(mountOptions, "noexec") - } - if statfs.Flags&unix.MS_NOSUID == unix.MS_NOSUID { - mountOptions = append(mountOptions, "nosuid") - } - if statfs.Flags&unix.MS_RDONLY == unix.MS_RDONLY { - mountOptions = append(mountOptions, "ro") + + if !hasSystemdMount { + skipMount := hasSystemdMount + var statfs unix.Statfs_t + if err := unix.Statfs("/sys/fs/cgroup/systemd", &statfs); err != nil { + if errors.Is(err, os.ErrNotExist) { + // If the mount is missing on the host, we cannot bind mount it so + // just skip it. + skipMount = true + } + mountOptions = append(mountOptions, "nodev", "noexec", "nosuid") + } else { + if statfs.Flags&unix.MS_NODEV == unix.MS_NODEV { + mountOptions = append(mountOptions, "nodev") + } + if statfs.Flags&unix.MS_NOEXEC == unix.MS_NOEXEC { + mountOptions = append(mountOptions, "noexec") + } + if statfs.Flags&unix.MS_NOSUID == unix.MS_NOSUID { + mountOptions = append(mountOptions, "nosuid") + } + if statfs.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + mountOptions = append(mountOptions, "ro") + } } - } - if !skipMount { - systemdMnt := spec.Mount{ - Destination: "/sys/fs/cgroup/systemd", - Type: "bind", - Source: "/sys/fs/cgroup/systemd", - Options: mountOptions, + if !skipMount { + systemdMnt := spec.Mount{ + Destination: "/sys/fs/cgroup/systemd", + Type: "bind", + Source: "/sys/fs/cgroup/systemd", + Options: mountOptions, + } + g.AddMount(systemdMnt) + g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent") } - g.AddMount(systemdMnt) - g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent") } } diff --git a/test/system/250-systemd.bats b/test/system/250-systemd.bats index 8c3064f9ef..f051b2146d 100644 --- a/test/system/250-systemd.bats +++ b/test/system/250-systemd.bats @@ -280,6 +280,13 @@ LISTEN_FDNAMES=listen_fdnames" | sort) is "${container_uuid}" "${output:0:32}" "UUID should be first 32 chars of Container id" } +@test "podman --systemd fails on cgroup v1 with a private cgroupns" { + skip_if_cgroupsv2 + + run_podman 126 run --systemd=always --cgroupns=private $IMAGE true + assert "$output" =~ ".*cgroup namespace is not supported with cgroup v1 and systemd mode" +} + # https://github.com/containers/podman/issues/13153 @test "podman rootless-netns slirp4netns process should be in different cgroup" { is_rootless || skip "only meaningful for rootless" diff --git a/test/system/helpers.bash b/test/system/helpers.bash index c43936dde1..57d3ee314a 100644 --- a/test/system/helpers.bash +++ b/test/system/helpers.bash @@ -497,6 +497,15 @@ function skip_if_cgroupsv1() { fi } +####################### +# skip_if_cgroupsv2 # ...with an optional message +####################### +function skip_if_cgroupsv2() { + if is_cgroupsv2; then + skip "${1:-test requires cgroupsv1}" + fi +} + ###################### # skip_if_rootless_cgroupsv1 # ...with an optional message ######################