Skip to content

Commit

Permalink
chown cgroup to process uid in container namespace
Browse files Browse the repository at this point in the history
Delegating cgroups to the container enables more complex workloads,
including systemd-based workloads.  The OCI runtime-spec was
recently updated to explicitly admit such delegation, through
specification of cgroup ownership semantics:

  opencontainers/runtime-spec#1123

Pursuant to the updated OCI runtime-spec, change the ownership of
the container's cgroup directory and particular files therein, when
using cgroups v2 and when the cgroupfs is to be mounted read/write.

As a result of this change, systemd workloads can run in isolated
user namespaces on OpenShift when the sandbox's cgroupfs is mounted
read/write.

It might be possible to implement this feature in other cgroup
managers, but that work is deferred.

Signed-off-by: Fraser Tweedale <[email protected]>
  • Loading branch information
frasertweedale committed Nov 24, 2021
1 parent 20feb5d commit 459de29
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 0 deletions.
32 changes: 32 additions & 0 deletions libcontainer/cgroups/systemd/v2.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package systemd

import (
"bufio"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
Expand Down Expand Up @@ -288,6 +290,36 @@ func (m *unifiedManager) Apply(pid int) error {
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
return err
}

if c.OwnerUID != nil {
// The kernel exposes a list of files that should chowned to
// the delegate uid in /sys/kernel/cgroup/delegate. If the
// file is not present (Linux < 4.15), use the initial
// values mentioned in in cgroups(7).
filesToChown := []string{"."} // the directory itself must be chowned
cgroupDelegateFile := "/sys/kernel/cgroup/delegate"
f, err := os.Open(cgroupDelegateFile)
if err == nil {
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
filesToChown = append(filesToChown, scanner.Text())
}
if err := scanner.Err(); err != nil {
return fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
}
} else {
filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads")
}

for _, v := range filesToChown {
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
if err != nil {
return err
}
}
}

return nil
}

Expand Down
7 changes: 7 additions & 0 deletions libcontainer/configs/cgroup_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ type Cgroup struct {

// Rootless tells if rootless cgroups should be used.
Rootless bool

// The host UID that should own the cgroup, or nil to accept
// the default ownership. This should only be set when the
// cgroupfs is to be mounted read/write.
// Not all cgroup manager implementations support changing
// the ownership.
OwnerUID *int `json:"owner_uid,omitempty"`
}

type Resources struct {
Expand Down
43 changes: 43 additions & 0 deletions libcontainer/specconv/spec_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,49 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
}
}
}

// Set the host UID that should own the container's cgroup.
// This must be performed after setupUserNamespace, so that
// config.HostRootUID() returns the correct result.
//
// Only set it if the container will have its own cgroup
// namespace and the cgroupfs will be mounted read/write.
//
hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == ""
hasRwCgroupfs := false
if hasCgroupNS {
for _, m := range config.Mounts {
if m.Source == "cgroup" && m.Destination == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 {
hasRwCgroupfs = true
break
}
}
}
processUid := 0
if spec.Process != nil {
// Chown the cgroup to the UID running the process,
// which is not necessarily UID 0 in the container
// namespace (e.g., an unprivileged UID in the host
// user namespace).
processUid = int(spec.Process.User.UID)
}
if hasCgroupNS && hasRwCgroupfs {
ownerUid, err := config.HostUID(processUid)
// There are two error cases; we can ignore both.
//
// 1. uidMappings is unset. Either there is no user
// namespace (fine), or it is an error (which is
// checked elsewhere).
//
// 2. The user is unmapped in the user namespace. This is an
// unusual configuration and might be an error. But it too
// will be checked elsewhere, so we can ignore it here.
//
if err == nil {
config.Cgroups.OwnerUID = &ownerUid
}
}

if spec.Process != nil {
config.OomScoreAdj = spec.Process.OOMScoreAdj
config.NoNewPrivileges = spec.Process.NoNewPrivileges
Expand Down
80 changes: 80 additions & 0 deletions tests/integration/cgroups.bats
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,86 @@ function setup() {
#
}

@test "runc exec (cgroup v2, ro cgroupfs, new cgroupns) does not chown cgroup" {
requires root cgroups_v2 systemd

set_cgroups_path

# configure a user namespace
update_config ' .linux.namespaces += [{"type": "user"}]
| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
'

# chown test temp dir to allow host user to read it
chown 100000 "$ROOT"

# chown rootfs to allow host user to mkdir mount points
chown 100000 "$ROOT"/bundle/rootfs

runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
[ "$status" -eq 0 ]

runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
[ "$status" -eq 0 ]
[ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user
}

@test "runc exec (cgroup v2, rw cgroupfs, inh cgroupns) does not chown cgroup" {
requires root cgroups_v2 systemd

set_cgroups_path
set_cgroup_mount_writable

# configure a user namespace
update_config ' .linux.namespaces += [{"type": "user"}]
| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
'

# inherit cgroup namespace (remove cgroup from namespaces list)
update_config '.linux.namespaces |= map(select(.type != "cgroup"))'

# chown test temp dir to allow host user to read it
chown 100000 "$ROOT"

# chown rootfs to allow host user to mkdir mount points
chown 100000 "$ROOT"/bundle/rootfs

runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
[ "$status" -eq 0 ]

runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
[ "$status" -eq 0 ]
[ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user
}

@test "runc exec (cgroup v2, rw cgroupfs, new cgroupns) does chown cgroup" {
requires root cgroups_v2 systemd

set_cgroups_path
set_cgroup_mount_writable

# configure a user namespace
update_config ' .linux.namespaces += [{"type": "user"}]
| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
'

# chown test temp dir to allow host user to read it
chown 100000 "$ROOT"

# chown rootfs to allow host user to mkdir mount points
chown 100000 "$ROOT"/bundle/rootfs

runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
[ "$status" -eq 0 ]

runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
[ "$status" -eq 0 ]
[ "$output" = "root" ] # /sys/fs/cgroup owned by root (of user namespace)
}

@test "runc run (cgroup v1 + unified resources should fail)" {
requires root cgroups_v1

Expand Down

0 comments on commit 459de29

Please sign in to comment.