Skip to content

Commit

Permalink
[Carry 2535] rootless: support detach-netns mode
Browse files Browse the repository at this point in the history
When RootlessKit v2.0 (rootless-containers/rootlesskit PR 379) is
installed, `containerd-rootless.sh` launches it with `--detach-netns`
so as to run the daemon in the host network namespace.

This will enable:
- Accelerated (and deflaked) `nerdctl pull`, `nerdctl push`, `nerdctl build`, etc
- Proper support for `nerdctl pull 127.0.0.1:.../...`
- Proper support for `nerdctl run --net=host`

Replaces Fahed Dorgaa's PR 2535

Co-authored-by: fahed dorgaa <[email protected]>
Signed-off-by: Akihiro Suda <[email protected]>
  • Loading branch information
AkihiroSuda and fahedouch committed Jan 31, 2024
1 parent 328497a commit 4892364
Show file tree
Hide file tree
Showing 15 changed files with 240 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ In addition to containerd, the following components should be installed:
- [BuildKit](https://github.com/moby/buildkit) (OPTIONAL): for using `nerdctl build`. BuildKit daemon (`buildkitd`) needs to be running. See also [the document about setting up BuildKit](./docs/build.md).
- v0.11.0 or later is highly recommended. Some features, such as pruning caches with `nerdctl system prune`, do not work with older versions.
- [RootlessKit](https://github.com/rootless-containers/rootlesskit) and [slirp4netns](https://github.com/rootless-containers/slirp4netns) (OPTIONAL): for [Rootless mode](./docs/rootless.md)
- RootlessKit needs to be v0.10.0 or later. v0.14.1 or later is recommended.
- RootlessKit needs to be v0.10.0 or later. v2.0.0 or later is recommended.
- slirp4netns needs to be v0.4.0 or later. v1.1.7 or later is recommended.

These dependencies are included in `nerdctl-full-<VERSION>-<OS>-<ARCH>.tar.gz`, but not included in `nerdctl-<VERSION>-<OS>-<ARCH>.tar.gz`.
Expand Down
5 changes: 5 additions & 0 deletions cmd/nerdctl/container_run_network_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,11 @@ func TestRunContainerWithMACAddress(t *testing.T) {
}

func TestHostsFileMounts(t *testing.T) {
if rootlessutil.IsRootless() {
if detachedNetNS, _ := rootlessutil.DetachedNetNS(); detachedNetNS != "" {
t.Skip("/etc/hosts is not writable")
}
}
base := testutil.NewBase(t)

base.Cmd("run", "--rm", testutil.CommonImage,
Expand Down
23 changes: 15 additions & 8 deletions cmd/nerdctl/ipfs_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,21 @@ func TestIPFSAddress(t *testing.T) {

func runIPFSDaemonContainer(t *testing.T, base *testutil.Base) (ipfsAddress string, done func()) {
name := "test-ipfs-address"
base.Cmd("run", "-d", "--name", name, "--entrypoint=/bin/sh", testutil.KuboImage, "-c", "ipfs init && ipfs config Addresses.API /ip4/0.0.0.0/tcp/5001 && ipfs daemon --offline").AssertOK()
iplines := base.Cmd("inspect", name, "-f", "'{{json .NetworkSettings.IPAddress}}'").OutLines()
t.Logf("IPAddress=%v", iplines)
assert.Equal(t, len(iplines), 2)
matches := iplineRegexp.FindStringSubmatch(iplines[0])
t.Logf("ip address matches=%v", matches)
assert.Equal(t, len(matches), 2)
ipfsaddr := fmt.Sprintf("/ip4/%s/tcp/5001", matches[1])
var ipfsaddr string
if detachedNetNS, _ := rootlessutil.DetachedNetNS(); detachedNetNS != "" {
// detached-netns mode can't use .NetworkSettings.IPAddress, because the daemon and CNI have different network namespaces
base.Cmd("run", "-d", "-p", "127.0.0.1:5999:5999", "--name", name, "--entrypoint=/bin/sh", testutil.KuboImage, "-c", "ipfs init && ipfs config Addresses.API /ip4/0.0.0.0/tcp/5999 && ipfs daemon --offline").AssertOK()
ipfsaddr = "/ip4/127.0.0.1/tcp/5999"
} else {
base.Cmd("run", "-d", "--name", name, "--entrypoint=/bin/sh", testutil.KuboImage, "-c", "ipfs init && ipfs config Addresses.API /ip4/0.0.0.0/tcp/5001 && ipfs daemon --offline").AssertOK()
iplines := base.Cmd("inspect", name, "-f", "'{{json .NetworkSettings.IPAddress}}'").OutLines()
t.Logf("IPAddress=%v", iplines)
assert.Equal(t, len(iplines), 2)
matches := iplineRegexp.FindStringSubmatch(iplines[0])
t.Logf("ip address matches=%v", matches)
assert.Equal(t, len(matches), 2)
ipfsaddr = fmt.Sprintf("/ip4/%s/tcp/5001", matches[1])
}
return ipfsaddr, func() {
base.Cmd("kill", "test-ipfs-address").AssertOK()
base.Cmd("rm", "test-ipfs-address").AssertOK()
Expand Down
30 changes: 30 additions & 0 deletions docs/rootless.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,36 @@ $ nerdctl run -it --rm -p 8080:80 --label nerdctl/bypass4netns=true alpine

More detail is available at [https://github.com/rootless-containers/bypass4netns/blob/master/README.md](https://github.com/rootless-containers/bypass4netns/blob/master/README.md)

## Configuring RootlessKit

Rootless containerd recognizes the following environment variables to configure the behavior of [RootlessKit](https://github.com/rootless-containers/rootlesskit):

* `CONTAINERD_ROOTLESS_ROOTLESSKIT_STATE_DIR=DIR`: the rootlesskit state dir. Defaults to `$XDG_RUNTIME_DIR/containerd-rootless`.
* `CONTAINERD_ROOTLESS_ROOTLESSKIT_NET=(slirp4netns|vpnkit|lxc-user-nic)`: the rootlesskit network driver. Defaults to "slirp4netns" if slirp4netns (>= v0.4.0) is installed. Otherwise defaults to "vpnkit".
* `CONTAINERD_ROOTLESS_ROOTLESSKIT_MTU=NUM`: the MTU value for the rootlesskit network driver. Defaults to 65520 for slirp4netns, 1500 for other drivers.
* `CONTAINERD_ROOTLESS_ROOTLESSKIT_PORT_DRIVER=(builtin|slirp4netns)`: the rootlesskit port driver. Defaults to "builtin".
* `CONTAINERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SANDBOX=(auto|true|false)`: whether to protect slirp4netns with a dedicated mount namespace. Defaults to "auto".
* `CONTAINERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SECCOMP=(auto|true|false)`: whether to protect slirp4netns with seccomp. Defaults to "auto".
* `CONTAINERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS=(auto|true|false)`: whether to launch rootlesskit with the "detach-netns" mode.
Defaults to "auto", which is resolved to "true" if RootlessKit >= 2.0 is installed.
The "detach-netns" mode accelerates `nerdctl (pull|push|build)` and enables `nerdctl run --net=host`,
however, there is a relatively minor drawback with BuildKit prior to v0.13:
the host loopback IP address (127.0.0.1) and abstract sockets are exposed to Dockerfile's "RUN" instructions during `nerdctl build` (not `nerdctl run`).
The drawback is fixed in BuildKit v0.13. Upgrading from a prior version of BuildKit needs removing the old systemd unit:
`containerd-rootless-setuptool.sh uninstall-buildkit && rm -f ~/.config/buildkit/buildkitd.toml`

To set these variables, create `~/.config/systemd/user/containerd.service.d/override.conf` as follows:
```ini
[Service]
Environment=CONTAINERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS="false"
```

And then run the following commands:
```bash
systemctl --user daemon-reload
systemctl --user restart containerd
```

## Troubleshooting

### Hint to Fedora users
Expand Down
15 changes: 14 additions & 1 deletion extras/rootless/containerd-rootless-setuptool.sh
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,13 @@ propagate_env_from() {
# CLI subcommand: nsenter
# Executes the given command ("$@") inside the namespaces of the RootlessKit
# child process whose PID is recorded in the containerd-rootless state dir.
cmd_entrypoint_nsenter() {
	# No need to call init()
	pid=$(cat "$XDG_RUNTIME_DIR/containerd-rootless/child_pid")
	n=""
	# If RootlessKit is running with `--detach-netns` mode, we do NOT enter the detached netns here
	if [ ! -e "$XDG_RUNTIME_DIR/containerd-rootless/netns" ]; then
		n="-n"
	fi
	propagate_env_from "$pid" ROOTLESSKIT_STATE_DIR ROOTLESSKIT_PARENT_EUID ROOTLESSKIT_PARENT_EGID
	# $n is intentionally left unquoted: when it is empty it must expand to
	# no argument at all rather than to an empty-string argument.
	# shellcheck disable=SC2086
	exec nsenter --no-fork --wd="$(pwd)" --preserve-credentials -m $n -U -t "$pid" -- "$@"
}

show_systemd_error() {
Expand Down Expand Up @@ -266,6 +271,10 @@ cmd_entrypoint_install_buildkit() {
exit 1
fi
BUILDKITD_FLAG="--oci-worker=true --oci-worker-rootless=true --containerd-worker=false"
if buildkitd --help | grep -q bridge; then
# Available since BuildKit v0.13
BUILDKITD_FLAG="${BUILDKITD_FLAG} --oci-worker-net=bridge"
fi
cat <<-EOT | install_systemd_unit "${SYSTEMD_BUILDKIT_UNIT}"
[Unit]
Description=BuildKit (Rootless)
Expand Down Expand Up @@ -307,6 +316,10 @@ cmd_entrypoint_install_buildkit_containerd() {
if [ -n "${CONTAINERD_SNAPSHOTTER:-}" ]; then
BUILDKITD_FLAG="${BUILDKITD_FLAG} --containerd-worker-snapshotter=${CONTAINERD_SNAPSHOTTER}"
fi
if buildkitd --help | grep -q bridge; then
# Available since BuildKit v0.13
BUILDKITD_FLAG="${BUILDKITD_FLAG} --containerd-worker-net=bridge"
fi
cat <<-EOT | install_systemd_unit "${UNIT_NAME}"
[Unit]
Description=BuildKit (Rootless)
Expand Down
31 changes: 30 additions & 1 deletion extras/rootless/containerd-rootless.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
# External dependencies:
# * newuidmap and newgidmap need to be installed.
# * /etc/subuid and /etc/subgid need to be configured for the current user.
# * RootlessKit (>= v0.10.0) needs to be installed. RootlessKit >= v0.14.1 is recommended.
# * RootlessKit (>= v0.10.0) needs to be installed. RootlessKit >= v2.0.0 is recommended.
# * Either one of slirp4netns (>= v0.4.0), VPNKit, lxc-user-nic needs to be installed. slirp4netns >= v1.1.7 is recommended.
#
# Recognized environment variables:
Expand All @@ -38,6 +38,15 @@
# * CONTAINERD_ROOTLESS_ROOTLESSKIT_PORT_DRIVER=(builtin|slirp4netns): the rootlesskit port driver. Defaults to "builtin".
# * CONTAINERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SANDBOX=(auto|true|false): whether to protect slirp4netns with a dedicated mount namespace. Defaults to "auto".
# * CONTAINERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SECCOMP=(auto|true|false): whether to protect slirp4netns with seccomp. Defaults to "auto".
# * CONTAINERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS=(auto|true|false): whether to launch rootlesskit with the "detach-netns" mode.
# Defaults to "auto", which is resolved to "true" if RootlessKit >= 2.0 is installed.
# The "detach-netns" mode accelerates `nerdctl (pull|push|build)` and enables `nerdctl run --net=host`,
# however, there is a relatively minor drawback with BuildKit prior to v0.13:
# the host loopback IP address (127.0.0.1) and abstract sockets are exposed to Dockerfile's "RUN" instructions during `nerdctl build` (not `nerdctl run`).
# The drawback is fixed in BuildKit v0.13. Upgrading from a prior version of BuildKit needs removing the old systemd unit:
# `containerd-rootless-setuptool.sh uninstall-buildkit && rm -f ~/.config/buildkit/buildkitd.toml`

# See also: https://github.com/containerd/nerdctl/blob/main/docs/rootless.md#configuring-rootlesskit

set -e
if ! [ -w $XDG_RUNTIME_DIR ]; then
Expand Down Expand Up @@ -69,6 +78,7 @@ if [ -z $_CONTAINERD_ROOTLESS_CHILD ]; then
: "${CONTAINERD_ROOTLESS_ROOTLESSKIT_PORT_DRIVER:=builtin}"
: "${CONTAINERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SANDBOX:=auto}"
: "${CONTAINERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SECCOMP:=auto}"
: "${CONTAINERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS:=auto}"
net=$CONTAINERD_ROOTLESS_ROOTLESSKIT_NET
mtu=$CONTAINERD_ROOTLESS_ROOTLESSKIT_MTU
if [ -z $net ]; then
Expand Down Expand Up @@ -107,6 +117,25 @@ if [ -z $_CONTAINERD_ROOTLESS_CHILD ]; then
export _CONTAINERD_ROOTLESS_SELINUX
fi
fi

# Resolve CONTAINERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS (auto|true|false) and,
# when enabled, prepend --detach-netns to the RootlessKit flags.
#
# The assignment MUST be quoted: an unquoted `VAR=--detach-netns $VAR` is parsed
# as "run the command `$VAR` with VAR set in its environment", which fails as
# soon as the user has supplied any flags of their own.
# `${VAR:+ $VAR}` appends the previous flags (with a separating space) only
# when they are non-empty, so no trailing blank is left behind.
case "$CONTAINERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS" in
auto)
	# "auto": enable the mode only if the installed rootlesskit advertises the flag.
	if rootlesskit --help | grep -qw -- "--detach-netns"; then
		CONTAINERD_ROOTLESS_ROOTLESSKIT_FLAGS="--detach-netns${CONTAINERD_ROOTLESS_ROOTLESSKIT_FLAGS:+ $CONTAINERD_ROOTLESS_ROOTLESSKIT_FLAGS}"
	fi
	;;
1 | true)
	CONTAINERD_ROOTLESS_ROOTLESSKIT_FLAGS="--detach-netns${CONTAINERD_ROOTLESS_ROOTLESSKIT_FLAGS:+ $CONTAINERD_ROOTLESS_ROOTLESSKIT_FLAGS}"
	;;
0 | false)
	# NOP
	;;
*)
	# Diagnostics belong on stderr.
	echo "Unknown CONTAINERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS value: $CONTAINERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS" >&2
	exit 1
	;;
esac

# Re-exec the script via RootlessKit, so as to create unprivileged {user,mount,network} namespaces.
#
# --copy-up allows removing/creating files in the directories by creating tmpfs and symlinks
Expand Down
24 changes: 24 additions & 0 deletions pkg/cmd/container/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import (
"github.com/containerd/nerdctl/v2/pkg/namestore"
"github.com/containerd/nerdctl/v2/pkg/platformutil"
"github.com/containerd/nerdctl/v2/pkg/referenceutil"
"github.com/containerd/nerdctl/v2/pkg/rootlessutil"
"github.com/containerd/nerdctl/v2/pkg/strutil"
dockercliopts "github.com/docker/cli/opts"
dockeropts "github.com/docker/docker/opts"
Expand Down Expand Up @@ -418,6 +419,29 @@ func GenerateLogURI(dataStore string) (*url.URL, error) {
}

func withNerdctlOCIHook(cmd string, args []string) (oci.SpecOpts, error) {
if rootlessutil.IsRootless() {
detachedNetNS, err := rootlessutil.DetachedNetNS()
if err != nil {
return nil, fmt.Errorf("failed to check whether RootlessKit is running with --detach-netns: %w", err)
}
if detachedNetNS != "" {
// Rewrite {cmd, args} if RootlessKit is running with --detach-netns, so that the hook can gain
// CAP_NET_ADMIN in the namespaces.
// - Old:
// - cmd: "/usr/local/bin/nerdctl"
// - args: {"--data-root=/foo", "internal", "oci-hook"}
// - New:
// - cmd: "/usr/bin/nsenter"
// - args: {"-n/run/user/1000/containerd-rootless/netns", "-F", "--", "/usr/local/bin/nerdctl", "--data-root=/foo", "internal", "oci-hook"}
oldCmd, oldArgs := cmd, args
cmd, err = exec.LookPath("nsenter")
if err != nil {
return nil, err
}
args = append([]string{"-n" + detachedNetNS, "-F", "--", oldCmd}, oldArgs...)
}
}

args = append([]string{cmd}, append(args, "internal", "oci-hook")...)
return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
if s.Hooks == nil {
Expand Down
41 changes: 41 additions & 0 deletions pkg/containerutil/container_network_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
"github.com/containerd/nerdctl/v2/pkg/mountutil"
"github.com/containerd/nerdctl/v2/pkg/netutil"
"github.com/containerd/nerdctl/v2/pkg/netutil/nettype"
"github.com/containerd/nerdctl/v2/pkg/rootlessutil"
"github.com/containerd/nerdctl/v2/pkg/strutil"
"github.com/opencontainers/runtime-spec/specs-go"
)
Expand Down Expand Up @@ -461,9 +462,49 @@ func (m *hostNetworkManager) ContainerNetworkingOpts(_ context.Context, containe
}
}

if rootlessutil.IsRootless() {
detachedNetNS, err := rootlessutil.DetachedNetNS()
if err != nil {
return nil, nil, fmt.Errorf("failed to check whether RootlessKit is running with --detach-netns: %w", err)
}
if detachedNetNS != "" {
// For rootless + host netns, we can't mount sysfs.
// We can't (non-recursively) bind mount /sys, either.
//
// TODO: consider to just rbind /sys from the host with rro,
// when rro is available (kernel >= 5.12, runc >= 1.1).
//
// Relevant: https://github.com/moby/buildkit/blob/v0.12.4/util/rootless/specconv/specconv_linux.go#L15-L34
specs = append(specs, withRemoveSysfs)
}
}

return specs, cOpts, nil
}

// withRemoveSysfs is a spec option that strips the sysfs mount, together with
// every mount located at or below /sys, from the container spec.
//
// If the spec contains no mount of type "sysfs", it is left untouched.
// Only destinations equal to "/sys" or starting with "/sys/" are removed, so
// unrelated paths that merely share the prefix (e.g. "/sysroot") are kept.
// It always returns nil.
func withRemoveSysfs(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error {
	var hasSysfs bool
	for _, mount := range s.Mounts {
		if mount.Type == "sysfs" {
			hasSysfs = true
			break
		}
	}
	if !hasSysfs {
		// NOP, as the user has specified a custom /sys mount
		return nil
	}
	var mounts []specs.Mount // nolint: prealloc
	for _, mount := range s.Mounts {
		// Match on a path-component boundary, not on the raw string prefix.
		if mount.Destination == "/sys" || strings.HasPrefix(mount.Destination, "/sys/") {
			continue
		}
		mounts = append(mounts, mount)
	}
	s.Mounts = mounts
	return nil
}

// types.NetworkOptionsManager implementation for CNI networking settings.
// This is a more specialized and OS-dependent networking model so this
// struct provides different implementations on different platforms.
Expand Down
1 change: 1 addition & 0 deletions pkg/defaults/defaults_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ func HostsDirs() []string {

// HostGatewayIP returns the non-loop-back host ip if available and returns empty string if running into error.
func HostGatewayIP() string {
// no need to use [rootlessutil.WithDetachedNetNSIfAny] here
addrs, err := net.InterfaceAddrs()
if err != nil {
return ""
Expand Down
15 changes: 9 additions & 6 deletions pkg/netutil/netutil_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/Masterminds/semver/v3"
"github.com/containerd/log"
"github.com/containerd/nerdctl/v2/pkg/defaults"
"github.com/containerd/nerdctl/v2/pkg/rootlessutil"
"github.com/containerd/nerdctl/v2/pkg/strutil"
"github.com/containerd/nerdctl/v2/pkg/systemutil"
"github.com/mitchellh/mapstructure"
Expand Down Expand Up @@ -322,11 +323,13 @@ func guessFirewallPluginVersion(stderr string) (*semver.Version, error) {
}

func removeBridgeNetworkInterface(netIf string) error {
link, err := netlink.LinkByName(netIf)
if err == nil {
if err := netlink.LinkDel(link); err != nil {
return fmt.Errorf("failed to remove network interface %s: %v", netIf, err)
return rootlessutil.WithDetachedNetNSIfAny(func() error {
link, err := netlink.LinkByName(netIf)
if err == nil {
if err := netlink.LinkDel(link); err != nil {
return fmt.Errorf("failed to remove network interface %s: %v", netIf, err)
}
}
}
return nil
return nil
})
}
10 changes: 8 additions & 2 deletions pkg/netutil/subnet/subnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,17 @@ package subnet
import (
"fmt"
"net"

"github.com/containerd/nerdctl/v2/pkg/rootlessutil"
)

func GetLiveNetworkSubnets() ([]*net.IPNet, error) {
addrs, err := net.InterfaceAddrs()
if err != nil {
var addrs []net.Addr
if err := rootlessutil.WithDetachedNetNSIfAny(func() error {
var err2 error
addrs, err2 = net.InterfaceAddrs()
return err2
}); err != nil {
return nil, err
}
nets := make([]*net.IPNet, 0, len(addrs))
Expand Down
15 changes: 14 additions & 1 deletion pkg/rootlessutil/parent_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,16 @@ func ParentMain(hostGatewayIP string) error {
return err
}

detachedNetNSPath, err := detachedNetNS(stateDir)
if err != nil {
return err
}
detachNetNSMode := detachedNetNSPath != ""
log.L.Debugf("RootlessKit detach-netns mode: %v", detachNetNSMode)
if err != nil {
return err
}

wd, err := os.Getwd()
if err != nil {
return err
Expand All @@ -92,10 +102,13 @@ func ParentMain(hostGatewayIP string) error {
"-r/", // root dir (busybox nsenter wants this to be explicitly specified),
"-w" + wd, // work dir
"--preserve-credentials",
"-m", "-n", "-U",
"-m", "-U",
"-t", strconv.Itoa(childPid),
"-F", // no fork
}
if !detachNetNSMode {
args = append(args, "-n")
}
args = append(args, os.Args...)
log.L.Debugf("rootless parent main: executing %q with %v", arg0, args)

Expand Down
Loading

0 comments on commit 4892364

Please sign in to comment.