From 0c8e2cc602bab3c8c1127b50aa26fa6f6954e58b Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 21 Nov 2023 00:36:28 +1100 Subject: [PATCH] *: actually support joining a userns with a new container (This is a cherry-pick of 1912d5988bbb379189ea9ceb2e03945738c513dc.) Our handling for name space paths with user namespaces has been broken for a long time. In particular, the need to parse /proc/self/*id_map in quite a few places meant that we would treat userns configurations that had a namespace path as if they were a userns configuration without mappings, resulting in errors. The primary issue was down to the id translation helper functions, which could only handle configurations that had explicit mappings. Obviously, when joining a user namespace we need to map the ids but figuring out the correct mapping is non-trivial in comparison. In order to get the mapping, you need to read /proc//*id_map of a process inside the userns -- while most userns paths will be of the form /proc//ns/user (and we have a fast-path for this case), this is not guaranteed and thus it is necessary to spawn a process inside the container and read its /proc//*id_map files in the general case. As Go does not allow us spawn a subprocess into a target userns, we have to use CGo to fork a sub-process which does the setns(2). To be honest, this is a little dodgy in regards to POSIX signal-safety(7) but since we do no allocations and we are executing in the forked context from a Go program (not a C program), it should be okay. The other alternative would be to do an expensive re-exec (a-la nsexec which would make several other bits of runc more complicated), or to use nsenter(1) which might not exist on the system and is less than ideal. Because we need to logically remap users quite a few times in runc (including in "runc init", where joining the namespace is not feasable), we cache the mapping inside the libcontainer config struct. A future patch will make sure that we stop allow invalid user configurations where a mapping is specified as well as a userns path to join. Finally, add an integration test to make sure we don't regress this again. Signed-off-by: Aleksa Sarai --- libcontainer/configs/validate/rootless.go | 31 ++-- libcontainer/specconv/spec_linux.go | 16 ++ libcontainer/userns/userns_maps.c | 79 ++++++++++ libcontainer/userns/userns_maps_linux.go | 171 ++++++++++++++++++++++ tests/integration/userns.bats | 83 ++++++++++- 5 files changed, 360 insertions(+), 20 deletions(-) create mode 100644 libcontainer/userns/userns_maps.c create mode 100644 libcontainer/userns/userns_maps_linux.go diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go index 9a6e5eb32a3..37c383366fe 100644 --- a/libcontainer/configs/validate/rootless.go +++ b/libcontainer/configs/validate/rootless.go @@ -28,25 +28,18 @@ func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { return nil } -func hasIDMapping(id int, mappings []configs.IDMap) bool { - for _, m := range mappings { - if id >= m.ContainerID && id < m.ContainerID+m.Size { - return true - } - } - return false -} - func rootlessEUIDMappings(config *configs.Config) error { if !config.Namespaces.Contains(configs.NEWUSER) { return errors.New("rootless container requires user namespaces") } - - if len(config.UidMappings) == 0 { - return errors.New("rootless containers requires at least one UID mapping") - } - if len(config.GidMappings) == 0 { - return errors.New("rootless containers requires at least one GID mapping") + // We only require mappings if we are not joining another userns. + if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" { + if len(config.UidMappings) == 0 { + return errors.New("rootless containers requires at least one UID mapping") + } + if len(config.GidMappings) == 0 { + return errors.New("rootless containers requires at least one GID mapping") + } } return nil } @@ -70,8 +63,8 @@ func rootlessEUIDMount(config *configs.Config) error { // Ignore unknown mount options. continue } - if !hasIDMapping(uid, config.UidMappings) { - return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers") + if _, err := config.HostUID(uid); err != nil { + return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err) } } @@ -82,8 +75,8 @@ func rootlessEUIDMount(config *configs.Config) error { // Ignore unknown mount options. continue } - if !hasIDMapping(gid, config.GidMappings) { - return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers") + if _, err := config.HostGID(gid); err != nil { + return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err) } } } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 1b358b24b4d..bef6a51df42 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/userns" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" @@ -939,6 +940,21 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { config.GidMappings = append(config.GidMappings, create(m)) } } + if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { + // Cache the current userns mappings in our configuration, so that we + // can calculate uid and gid mappings within runc. These mappings are + // never used for configuring the container if the path is set. + uidMap, gidMap, err := userns.GetUserNamespaceMappings(path) + if err != nil { + return fmt.Errorf("failed to cache mappings for userns: %w", err) + } + config.UidMappings = uidMap + config.GidMappings = gidMap + logrus.WithFields(logrus.Fields{ + "uid_map": uidMap, + "gid_map": gidMap, + }).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached") + } rootUID, err := config.HostRootUID() if err != nil { return err diff --git a/libcontainer/userns/userns_maps.c b/libcontainer/userns/userns_maps.c new file mode 100644 index 00000000000..84f2c6188c3 --- /dev/null +++ b/libcontainer/userns/userns_maps.c @@ -0,0 +1,79 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +/* + * All of the code here is run inside an aync-signal-safe context, so we need + * to be careful to not call any functions that could cause issues. In theory, + * since we are a Go program, there are fewer restrictions in practice, it's + * better to be safe than sorry. + * + * The only exception is exit, which we need to call to make sure we don't + * return into runc. + */ + +void bail(int pipefd, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vdprintf(pipefd, fmt, args); + va_end(args); + + exit(1); +} + +int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd) +{ + char buffer[4096] = { 0 }; + + pid_t child = fork(); + if (child != 0) + return child; + /* in child */ + + /* Join the target userns. */ + int nsfd = open(userns_path, O_RDONLY); + if (nsfd < 0) + bail(errfd, "open userns path %s failed: %m", userns_path); + + int err = setns(nsfd, CLONE_NEWUSER); + if (err < 0) + bail(errfd, "setns %s failed: %m", userns_path); + + close(nsfd); + + /* Pipe the requested file contents. */ + int fd = open(path, O_RDONLY); + if (fd < 0) + bail(errfd, "open %s in userns %s failed: %m", path, userns_path); + + int nread, ntotal = 0; + while ((nread = read(fd, buffer, sizeof(buffer))) != 0) { + if (nread < 0) + bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal); + ntotal += nread; + + int nwritten = 0; + while (nwritten < nread) { + int n = write(outfd, buffer, nread - nwritten); + if (n < 0) + bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m", + nread - nwritten, path, nwritten); + nwritten += n; + } + if (nread != nwritten) + bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten); + } + + close(fd); + close(outfd); + close(errfd); + + /* We must exit here, otherwise we would return into a forked runc. */ + exit(0); +} diff --git a/libcontainer/userns/userns_maps_linux.go b/libcontainer/userns/userns_maps_linux.go new file mode 100644 index 00000000000..d81290de2cb --- /dev/null +++ b/libcontainer/userns/userns_maps_linux.go @@ -0,0 +1,171 @@ +//go:build linux + +package userns + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "unsafe" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" +) + +/* +#include +extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd); +*/ +import "C" + +func parseIdmapData(data []byte) (ms []configs.IDMap, err error) { + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + var m configs.IDMap + line := scanner.Text() + if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil { + return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err) + } + ms = append(ms, m) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("parsing id map failed: %w", err) + } + return ms, nil +} + +// Do something equivalent to nsenter --user= cat , but more +// efficiently. Returns the contents of the requested file from within the user +// namespace. +func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) { + rdr, wtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err) + } + defer rdr.Close() + defer wtr.Close() + + errRdr, errWtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err) + } + defer errRdr.Close() + defer errWtr.Close() + + cNsPath := C.CString(nsPath) + defer C.free(unsafe.Pointer(cNsPath)) + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + + childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd())) + + if childPid < 0 { + return nil, fmt.Errorf("failed to spawn fork for userns") + } else if childPid == 0 { + // this should never happen + panic("runc executing inside fork child -- unsafe state!") + } + + // We are in the parent -- close the write end of the pipe before reading. + wtr.Close() + output, err := io.ReadAll(rdr) + rdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn failed: %w", err) + } + + // Ditto for the error pipe. + errWtr.Close() + errOutput, err := io.ReadAll(errRdr) + errRdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err) + } + errOutput = bytes.TrimSpace(errOutput) + + // Clean up the child. + child, err := os.FindProcess(int(childPid)) + if err != nil { + return nil, fmt.Errorf("could not find userns spawn process: %w", err) + } + state, err := child.Wait() + if err != nil { + return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err) + } + if !state.Success() { + errStr := string(errOutput) + if errStr == "" { + errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode()) + } + return nil, fmt.Errorf("userns spawn: %s", errStr) + } else if len(errOutput) > 0 { + // We can just ignore weird output in the error pipe if the process + // didn't bail(), but for completeness output for debugging. + logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput)) + } + // The subprocess succeeded, return whatever it wrote to the pipe. + return output, nil +} + +func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) { + var ( + pid int + extra rune + tryFastPath bool + ) + + // nsPath is usually of the form /proc//ns/user, which means that we + // already have a pid that is part of the user namespace and thus we can + // just use the pid to read from /proc//*id_map. + // + // Note that Sscanf doesn't consume the whole input, so we check for any + // trailing data with %c. That way, we can be sure the pattern matched + // /proc/$pid/ns/user _exactly_ iff n === 1. + if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 { + tryFastPath = pid > 0 + } + + for _, mapType := range []struct { + name string + idMap *[]configs.IDMap + }{ + {"uid_map", &uidMap}, + {"gid_map", &gidMap}, + } { + var mapData []byte + + if tryFastPath { + path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name) + data, err := os.ReadFile(path) + if err != nil { + // Do not error out here -- we need to try the slow path if the + // fast path failed. + logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err) + } else { + mapData = data + } + } else { + logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath) + } + + if mapData == nil { + // We have to actually join the namespace if we cannot take the + // fast path. The path is resolved with respect to the child + // process, so just use /proc/self. + data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name) + if err != nil { + return nil, nil, err + } + mapData = data + } + idMap, err := parseIdmapData(mapData) + if err != nil { + return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err) + } + *mapType.idMap = idMap + } + + return uidMap, gidMap, nil +} diff --git a/tests/integration/userns.bats b/tests/integration/userns.bats index eca1e0ce58a..2329119a2fe 100644 --- a/tests/integration/userns.bats +++ b/tests/integration/userns.bats @@ -15,15 +15,25 @@ function setup() { mkdir -p rootfs/{proc,sys,tmp} mkdir -p rootfs/tmp/mount-{1,2} + to_umount_list="$(mktemp "$BATS_RUN_TMPDIR/userns-mounts.XXXXXX")" if [ "$ROOTLESS" -eq 0 ]; then update_config ' .linux.namespaces += [{"type": "user"}] | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] - | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + | .linux.gidMappings += [{"hostID": 200000, "containerID": 0, "size": 65534}] ' fi } function teardown() { teardown_bundle + + if [ -v to_umount_list ]; then + while read -r mount_path; do + umount -l "$mount_path" || : + rm -f "$mount_path" + done <"$to_umount_list" + rm -f "$to_umount_list" + unset to_umount_list + fi } @test "userns with simple mount" { @@ -83,3 +93,74 @@ function teardown() { runc exec test_busybox cat /sys/fs/cgroup/{pids,memory}/tasks [ "$status" -eq 0 ] } + +@test "userns join other container userns" { + # Create a detached container with the id-mapping we want. + update_config '.process.args = ["sleep", "infinity"]' + runc run -d --console-socket "$CONSOLE_SOCKET" target_userns + [ "$status" -eq 0 ] + + # Configure our container to attach to the first container's userns. + target_pid="$(__runc state target_userns | jq .pid)" + update_config '.linux.namespaces |= map(if .type == "user" then (.path = "/proc/'"$target_pid"'/ns/" + .type) else . end) + | del(.linux.uidMappings) + | del(.linux.gidMappings)' + runc run -d --console-socket "$CONSOLE_SOCKET" in_userns + [ "$status" -eq 0 ] + + runc exec in_userns cat /proc/self/uid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+100000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi + + runc exec in_userns cat /proc/self/gid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+200000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi +} + +@test "userns join other container userns [bind-mounted nsfd]" { + requires root + + # Create a detached container with the id-mapping we want. + update_config '.process.args = ["sleep", "infinity"]' + runc run -d --console-socket "$CONSOLE_SOCKET" target_userns + [ "$status" -eq 0 ] + + # Bind-mount the first containers userns nsfd to a different path, to + # exercise the non-fast-path (where runc has to join the userns to get the + # mappings). + target_pid="$(__runc state target_userns | jq .pid)" + userns_path=$(mktemp "$BATS_RUN_TMPDIR/userns.XXXXXX") + mount --bind "/proc/$target_pid/ns/user" "$userns_path" + echo "$userns_path" >>"$to_umount_list" + + # Configure our container to attach to the first container's userns. + update_config '.linux.namespaces |= map(if .type == "user" then (.path = "'"$userns_path"'") else . end) + | del(.linux.uidMappings) + | del(.linux.gidMappings)' + runc run -d --console-socket "$CONSOLE_SOCKET" in_userns + [ "$status" -eq 0 ] + + runc exec in_userns cat /proc/self/uid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+100000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi + + runc exec in_userns cat /proc/self/gid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+200000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi +}