Skip to content

Commit

Permalink
chown cgroup to process uid in container namespace
Browse files Browse the repository at this point in the history
Delegating cgroups to the container enables more complex workloads,
including systemd-based workloads.  The OCI runtime-spec was
recently updated to explicitly admit such delegation, through
specification of cgroup ownership semantics:

  opencontainers/runtime-spec#1123

Pursuant to the updated OCI runtime-spec, change the ownership of
the container's cgroup directory and particular files therein, when
using cgroups v2 and when the cgroupfs is to be mounted read/write.

As a result of this change, systemd workloads can run in isolated
user namespaces on OpenShift when the sandbox's cgroupfs is mounted
read/write.

It might be possible to implement this feature in other cgroup
managers, but that work is deferred.

Signed-off-by: Fraser Tweedale <[email protected]>
  • Loading branch information
frasertweedale committed Nov 29, 2021
1 parent 6ff0420 commit 35d20c4
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 0 deletions.
39 changes: 39 additions & 0 deletions libcontainer/cgroups/systemd/v2.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package systemd

import (
"bufio"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
Expand Down Expand Up @@ -288,9 +290,46 @@ func (m *unifiedManager) Apply(pid int) error {
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
return err
}

if c.OwnerUID != nil {
filesToChown, err := cgroupFilesToChown()
if err != nil {
return err
}

for _, v := range filesToChown {
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
if err != nil {
return err
}
}
}

return nil
}

// The kernel exposes a list of files that should be chowned to the delegate
// uid in /sys/kernel/cgroup/delegate. If the file is not present
// (Linux < 4.15), use the initial values mentioned in cgroups(7).
func cgroupFilesToChown() ([]string, error) {
filesToChown := []string{"."} // the directory itself must be chowned
const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
f, err := os.Open(cgroupDelegateFile)
if err == nil {
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
filesToChown = append(filesToChown, scanner.Text())
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
}
} else {
filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads")
}
return filesToChown, nil
}

func (m *unifiedManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
Expand Down
7 changes: 7 additions & 0 deletions libcontainer/configs/cgroup_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ type Cgroup struct {

// Rootless tells if rootless cgroups should be used.
Rootless bool

// The host UID that should own the cgroup, or nil to accept
// the default ownership. This should only be set when the
// cgroupfs is to be mounted read/write.
// Not all cgroup manager implementations support changing
// the ownership.
OwnerUID *int `json:"owner_uid,omitempty"`
}

type Resources struct {
Expand Down
43 changes: 43 additions & 0 deletions libcontainer/specconv/spec_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,49 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
}
}
}

// Set the host UID that should own the container's cgroup.
// This must be performed after setupUserNamespace, so that
// config.HostRootUID() returns the correct result.
//
// Only set it if the container will have its own cgroup
// namespace and the cgroupfs will be mounted read/write.
//
hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == ""
hasRwCgroupfs := false
if hasCgroupNS {
for _, m := range config.Mounts {
if m.Source == "cgroup" && filepath.Clean(m.Destination) == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 {
hasRwCgroupfs = true
break
}
}
}
processUid := 0
if spec.Process != nil {
// Chown the cgroup to the UID running the process,
// which is not necessarily UID 0 in the container
// namespace (e.g., an unprivileged UID in the host
// user namespace).
processUid = int(spec.Process.User.UID)
}
if hasCgroupNS && hasRwCgroupfs {
ownerUid, err := config.HostUID(processUid)
// There are two error cases; we can ignore both.
//
// 1. uidMappings is unset. Either there is no user
// namespace (fine), or it is an error (which is
// checked elsewhere).
//
// 2. The user is unmapped in the user namespace. This is an
// unusual configuration and might be an error. But it too
// will be checked elsewhere, so we can ignore it here.
//
if err == nil {
config.Cgroups.OwnerUID = &ownerUid
}
}

if spec.Process != nil {
config.OomScoreAdj = spec.Process.OOMScoreAdj
config.NoNewPrivileges = spec.Process.NoNewPrivileges
Expand Down
61 changes: 61 additions & 0 deletions tests/integration/cgroup_delegation.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env bats

load helpers

function teardown() {
teardown_bundle
}

function setup() {
requires root cgroups_v2 systemd

setup_busybox

# chown test temp dir to allow host user to read it
chown 100000 "$ROOT"

# chown rootfs to allow host user to mkdir mount points
chown 100000 "$ROOT"/bundle/rootfs

set_cgroups_path

# configure a user namespace
update_config ' .linux.namespaces += [{"type": "user"}]
| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
'
}

@test "runc exec (cgroup v2, ro cgroupfs, new cgroupns) does not chown cgroup" {
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
[ "$status" -eq 0 ]

runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
[ "$status" -eq 0 ]
[ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user
}

@test "runc exec (cgroup v2, rw cgroupfs, inh cgroupns) does not chown cgroup" {
set_cgroup_mount_writable

# inherit cgroup namespace (remove cgroup from namespaces list)
update_config '.linux.namespaces |= map(select(.type != "cgroup"))'

runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
[ "$status" -eq 0 ]

runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
[ "$status" -eq 0 ]
[ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user
}

@test "runc exec (cgroup v2, rw cgroupfs, new cgroupns) does chown cgroup" {
set_cgroup_mount_writable

runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
[ "$status" -eq 0 ]

runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
[ "$status" -eq 0 ]
[ "$output" = "root" ] # /sys/fs/cgroup owned by root (of user namespace)
}

0 comments on commit 35d20c4

Please sign in to comment.