From 1a7c64356d5499399f79ce12788ca523f7d52149 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 26 Apr 2023 14:01:47 -0700 Subject: [PATCH] libct/cg/sd/v2: rely on systemd to set device rules It seems that the code added by commit b810da149008f1d7d0 had cgroup v1 in mind, where runc overwrites the rules set by systemd. It is all different in v2, because both ebpf programs (systemd's and runc's) have to say "allow" for the device to get access. So, when using cgroup v2 with systemd cgroup driver, access to devices rules for that can't be translated to systemd properties is not possible at all, and it makes sense to error out (rather than warn) in such case, as the container won't work as intended. With this change in mind, provided that runc correctly translates all the device access rule, and systemd correctly applies those, we no longer have to create and apply a second eBPF program with own rules. Let's stop doing that, instead relying on systemd only. Having two sets of rules (two ebpf programs) for cgroupv2/ebpf is problematic for two reasons: 1. Both sets should say "ok" for access to be granted (as explained by the previous commit). 2. After systemd daemon-reload (which happens during routine systemd upgrade) the program runc adds is removed, so it's a time-bomb. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/devices/systemd.go | 35 +++++++++++++++++++------ libcontainer/cgroups/fs2/fs2.go | 15 ++++++----- libcontainer/cgroups/systemd/common.go | 6 ++--- libcontainer/cgroups/systemd/v1.go | 2 +- libcontainer/cgroups/systemd/v2.go | 7 +---- 5 files changed, 40 insertions(+), 25 deletions(-) diff --git a/libcontainer/cgroups/devices/systemd.go b/libcontainer/cgroups/devices/systemd.go index 70768f9dbbb..403752c8698 100644 --- a/libcontainer/cgroups/devices/systemd.go +++ b/libcontainer/cgroups/devices/systemd.go @@ -17,7 +17,7 @@ import ( // systemdProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. -func systemdProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) { +func systemdProperties(r *configs.Resources, sdVer int, cgroupVer int) ([]systemdDbus.Property, error) { if r.SkipDevices { return nil, nil } @@ -95,12 +95,26 @@ func systemdProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, // The only type of rule we can't handle is wildcard-major rules, and // so we'll give a warning in that case (note that the fallback code // will insert any rules systemd couldn't handle). What amazing fun. + // + // The properties generated here are applied by systemd. In case of + // cgroup v1 ("device.{allow,deny}" files), runc when overwrites the + // systemd rules, so the correct rules are in effect (except for the + // case when systemd is restarted and our rules gets overwritten, but + // there is nothing we can do here). + // + // For cgroup v2, we only use systemd rules, so we error out if + // we can't translate some rules. if rule.Major == devices.Wildcard { // "_ *:n _" rules aren't supported by systemd. if rule.Minor != devices.Wildcard { - logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + w := fmt.Errorf("systemd does not support wildcard-major device rule: %+v", *rule) + if cgroupVer == 2 { + return nil, w + } + logrus.Warnf("temporarily ignoring rule: %v", w) continue + } // "_ *:* _" rules just wildcard everything. @@ -117,7 +131,11 @@ func systemdProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, } if group == "" { // Couldn't find a group. - logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + w := fmt.Errorf("systemd older than v240 does not support wildcard-minor rules for devices not listed in /proc/devices: +%v", *rule) + if cgroupVer == 2 { + return nil, err + } + logrus.Warnf("temporarily ignoring rule: %v", w) continue } entry.Path = group @@ -131,12 +149,13 @@ func systemdProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, } if sdVer < 240 { // Old systemd versions use stat(2) on path to find out device major:minor - // numbers and type. If the path doesn't exist, it will not add the rule, - // emitting a warning instead. - // Since all of this logic is best-effort anyway (we manually set these - // rules separately to systemd) we can safely skip entries that don't - // have a corresponding path. + // numbers and type. If the path doesn't exist, it will not add the rule. if _, err := os.Stat(entry.Path); err != nil { + w := fmt.Errorf("systemd older than v240 does not support device rules for non-existing device: %w", err) + if cgroupVer == 2 { + return nil, err + } + logrus.Warnf("temporarily ignoring rule: %v", w) continue } } diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 42b5bcb60cf..fba077d080d 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -171,13 +171,14 @@ func (m *Manager) Set(r *configs.Resources) error { return err } // devices (since kernel 4.15, pseudo-controller) - // - // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. - // However, errors from other subsystems are not ignored. - // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" - if err := setDevices(m.dirPath, r); err != nil { - if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { - return err + if !m.config.Systemd { + // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if err := setDevices(m.dirPath, r); err != nil { + if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { + return err + } } } // cpuset (since kernel 5.0) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index c577a22b03a..10bd34a6e11 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -33,7 +33,7 @@ var ( isRunningSystemdOnce sync.Once isRunningSystemd bool - GenerateDeviceProps func(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) + GenerateDeviceProps func(r *configs.Resources, sdVer, cgroupVer int) ([]systemdDbus.Property, error) ) // NOTE: This function comes from package github.com/coreos/go-systemd/util @@ -342,7 +342,7 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st // generateDeviceProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. -func generateDeviceProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { +func generateDeviceProperties(r *configs.Resources, cm *dbusConnManager, cgroupVersion int) ([]systemdDbus.Property, error) { if GenerateDeviceProps == nil { if len(r.Devices) > 0 { return nil, cgroups.ErrDevicesUnsupported @@ -350,5 +350,5 @@ func generateDeviceProperties(r *configs.Resources, cm *dbusConnManager) ([]syst return nil, nil } - return GenerateDeviceProps(r, systemdVersion(cm)) + return GenerateDeviceProps(r, systemdVersion(cm), cgroupVersion) } diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index ffe87798782..073dc9a8efd 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -75,7 +75,7 @@ var legacySubsystems = []subsystem{ func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property - deviceProperties, err := generateDeviceProperties(r, cm) + deviceProperties, err := generateDeviceProperties(r, cm, 1) if err != nil { return nil, err } diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index b28ec6b22f2..9cf3ee6d725 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -209,12 +209,7 @@ func genV2ResourcesProperties(dirPath string, r *configs.Resources, cm *dbusConn var properties []systemdDbus.Property - // NOTE: This is of questionable correctness because we insert our own - // devices eBPF program later. Two programs with identical rules - // aren't the end of the world, but it is a bit concerning. However - // it's unclear if systemd removes all eBPF programs attached when - // doing SetUnitProperties... - deviceProperties, err := generateDeviceProperties(r, cm) + deviceProperties, err := generateDeviceProperties(r, cm, 2) if err != nil { return nil, err }