Skip to content

Commit

Permalink
chown cgroup to process uid in container namespace
Browse files (browse the repository at this point in the history)
Delegating cgroups to the container enables more complex workloads,
including systemd-based workloads.  The OCI runtime-spec was
recently updated to explicitly admit such delegation, through
specification of cgroup ownership semantics:

  opencontainers/runtime-spec#1123

Pursuant to the updated OCI runtime-spec, change the ownership of
the container's cgroup directory and particular files therein, when
using cgroups v2 and when the cgroupfs is to be mounted read/write.

As a result of this change, systemd workloads can run in isolated
user namespaces on OpenShift when the sandbox's cgroupfs is mounted
read/write.

It might be possible to implement this feature in other cgroup
managers, but that work is deferred.

Signed-off-by: Fraser Tweedale <[email protected]>
  • Loading branch information
frasertweedale committed Nov 29, 2021
1 parent b09c164 commit 0cea9ff
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 0 deletions.
39 changes: 39 additions & 0 deletions libcontainer/cgroups/systemd/v2.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package systemd

import (
"bufio"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
Expand Down Expand Up @@ -288,9 +290,46 @@ func (m *unifiedManager) Apply(pid int) error {
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
return err
}

if c.OwnerUID != nil {
filesToChown, err := cgroupFilesToChown()
if err != nil {
return err
}

for _, v := range filesToChown {
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
if err != nil {
return err
}
}
}

return nil
}

// cgroupFilesToChown returns the names, relative to a cgroup directory, of
// the entries whose ownership should be handed to the delegate uid.
//
// The list always begins with "." (the cgroup directory itself). The
// remaining entries are read, one per line, from
// /sys/kernel/cgroup/delegate. If that file cannot be opened (e.g. it is
// absent on Linux < 4.15), the initial values mentioned in cgroups(7) are
// used instead. An error is returned only when reading an opened delegate
// file fails.
func cgroupFilesToChown() ([]string, error) {
	const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"

	delegate, openErr := os.Open(cgroupDelegateFile)
	if openErr != nil {
		// No kernel-provided list is available: fall back to the
		// directory itself plus the static defaults.
		return []string{".", "cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
	}
	defer delegate.Close()

	// The directory itself must be chowned, so it always comes first.
	names := []string{"."}
	lines := bufio.NewScanner(delegate)
	for lines.Scan() {
		names = append(names, lines.Text())
	}
	if scanErr := lines.Err(); scanErr != nil {
		return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, scanErr)
	}
	return names, nil
}

func (m *unifiedManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
Expand Down
7 changes: 7 additions & 0 deletions libcontainer/configs/cgroup_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ type Cgroup struct {

// Rootless tells if rootless cgroups should be used.
Rootless bool

// The host UID that should own the cgroup, or nil to accept
// the default ownership. This should only be set when the
// cgroupfs is to be mounted read/write.
// Not all cgroup manager implementations support changing
// the ownership.
OwnerUID *int `json:"owner_uid,omitempty"`
}

type Resources struct {
Expand Down
43 changes: 43 additions & 0 deletions libcontainer/specconv/spec_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,49 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
}
}
}

// Set the host UID that should own the container's cgroup.
// This must be performed after setupUserNamespace, so that
// config.HostRootUID() returns the correct result.
//
// Only set it if the container will have its own cgroup
// namespace and the cgroupfs will be mounted read/write.
//
hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == ""
hasRwCgroupfs := false
if hasCgroupNS {
for _, m := range config.Mounts {
if m.Source == "cgroup" && filepath.Clean(m.Destination) == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 {
hasRwCgroupfs = true
break
}
}
}
processUid := 0
if spec.Process != nil {
// Chown the cgroup to the UID running the process,
// which is not necessarily UID 0 in the container
// namespace (e.g., an unprivileged UID in the host
// user namespace).
processUid = int(spec.Process.User.UID)
}
if hasCgroupNS && hasRwCgroupfs {
ownerUid, err := config.HostUID(processUid)
// There are two error cases; we can ignore both.
//
// 1. uidMappings is unset. Either there is no user
// namespace (fine), or it is an error (which is
// checked elsewhere).
//
// 2. The user is unmapped in the user namespace. This is an
// unusual configuration and might be an error. But it too
// will be checked elsewhere, so we can ignore it here.
//
if err == nil {
config.Cgroups.OwnerUID = &ownerUid
}
}

if spec.Process != nil {
config.OomScoreAdj = spec.Process.OOMScoreAdj
config.NoNewPrivileges = spec.Process.NoNewPrivileges
Expand Down

0 comments on commit 0cea9ff

Please sign in to comment.