Skip to content

Commit

Permalink
Support Rootless Docker, with vanilla Kubernetes
Browse files Browse the repository at this point in the history
Tested with vanilla Kubernetes v1.20.4

Signed-off-by: Akihiro Suda <[email protected]>
  • Loading branch information
AkihiroSuda committed Mar 5, 2021
1 parent ca88477 commit 85d51d8
Show file tree
Hide file tree
Showing 8 changed files with 235 additions and 13 deletions.
2 changes: 2 additions & 0 deletions images/base/files/etc/containerd/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ version = 2
tolerate_missing_hugepages_controller = true
# explicitly use default snapshotter so we can sed it in entrypoint
snapshotter = "overlayfs"
# restrict_oom_score_adj needs to be true when running inside UserNS (rootless)
restrict_oom_score_adj = false
112 changes: 106 additions & 6 deletions images/base/files/usr/local/bin/entrypoint
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,97 @@ set -o errexit
set -o nounset
set -o pipefail

# Detect whether we are running inside a non-initial user namespace.
# If /proc/self/uid_map contains only the full-range mapping "0 0 4294967295",
# we are in the initial user namespace, i.e. the host.
# Otherwise we are in a non-initial user namespace.
# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
# userns stays empty on the host and is set to "1" inside a user namespace.
userns=""
# grep -v succeeds when at least one line does NOT match the full-range mapping
if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
userns="1"
echo 'INFO: running in a user namespace (experimental)'
fi

#######################################
# Validate the environment when running inside a user namespace (rootless).
# Does nothing when not running inside a user namespace.
# Globals:   userns (read) - non-empty when inside a non-initial user namespace
# Outputs:   warnings and errors on stderr
# Returns:   0 when not in a user namespace or when all checks pass;
#            exits 1 if cgroup v2 is not enabled or a required controller
#            (cpu, memory, pids) is not listed in cgroup.controllers
#######################################
validate_userns() {
  if [[ -z "${userns}" ]]; then
    return 0
  fi

  local nofile_hard
  nofile_hard="$(ulimit -Hn)"
  local nofile_hard_expected="64000"
  # ulimit may report the literal string "unlimited"; comparing that with -lt
  # would evaluate it as an (unset) variable name, so skip the numeric check.
  if [[ "${nofile_hard}" != "unlimited" && "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
    echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
  fi

  # cgroup.controllers at the cgroup root only exists on the unified (v2) hierarchy
  if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2
    exit 1
  fi

  # keep the loop variable local so it does not leak into the global scope
  local controller
  for controller in cpu memory pids; do
    if ! grep -qw -- "${controller}" /sys/fs/cgroup/cgroup.controllers; then
      echo "ERROR: UserNS: ${controller} controller needs to be delegated" >&2
      exit 1
    fi
  done
}

#######################################
# Shadow a file with a writable copy holding the given content.
# The copy is created under /run/fake and bind-mounted over the target.
# Arguments: $1 - path of the file to shadow
#            $2 - content to write into the shadowing copy
# Outputs:   an INFO line on stdout
#######################################
fake_file_with_content(){
  local path="$1" content="$2"
  local shadow="/run/fake/${path}"

  mkdir -p "$(dirname "${shadow}")"
  echo "INFO: UserNS: faking ${path} to be \"${content}\" (writable)"
  echo "${content}" > "${shadow}"
  mount --bind "${shadow}" "${path}"
}

#######################################
# Make a sysctl entry appear writable by shadowing its /proc/sys file with a
# copy holding the current value. Does nothing if the entry does not exist.
# Arguments: $1 - sysctl key in dotted form, e.g. "vm.overcommit_memory"
#######################################
fake_sysctl() {
  local key="$1"
  # translate the dotted key into its /proc/sys path (dots become slashes)
  local path="/proc/sys/${key//.//}"

  [[ -f "${path}" ]] || return 0

  local content
  content="$(cat "${path}")"
  fake_file_with_content "${path}" "${content}"
}

#######################################
# Adjust the containerd configuration for the current environment.
# Globals:   userns (read) - non-empty when inside a non-initial user namespace
# Outputs:   an INFO line on stdout when overlayfs has to be disabled
#######################################
configure_containerd() {
  local config="/etc/containerd/config.toml"
  local use_native='s/snapshotter = "overlayfs"/snapshotter = "native"/'

  # we need to switch to the 'native' snapshotter on zfs
  if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
    sed -i "${use_native}" "${config}"
  fi

  # everything below is userns (rootless) specific
  if [[ -z "$userns" ]]; then
    return 0
  fi

  # Adjust oomScoreAdj
  sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' "${config}"

  # mounting overlayfs inside userns requires patching kernel.
  # Ubuntu kernel is patched by default.
  # Debian kernel is patched by default as well, but Debian needs `sudo modprobe overlay permit_mounts_in_userns=1`.
  # Probe by attempting a throwaway overlay mount in a temporary directory.
  local probe
  probe=$(mktemp -d)
  mkdir -p "${probe}"/{l,u,w,m}
  if ! mount -t overlay overlay -o "lowerdir=${probe}/l,upperdir=${probe}/u,workdir=${probe}/w" "${probe}/m"; then
    echo 'INFO: UserNS: this kernel does not support mounting overlayfs inside userns. Disabling overlayfs'
    sed -i "${use_native}" "${config}"
  else
    umount "${probe}/m"
  fi
  rm -rf "${probe}"

  # To run vanilla kubelet inside UserNS, we need to fake several unwritable sysctl to be writable.
  # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
  local sysctl_key
  for sysctl_key in \
    "vm.overcommit_memory" \
    "vm.panic_on_oom" \
    "kernel.panic" \
    "kernel.panic_on_oops" \
    "kernel.keys.root_maxkeys" \
    "kernel.keys.root_maxbytes"; do
    fake_sysctl "${sysctl_key}"
  done
}

configure_proxy() {
Expand Down Expand Up @@ -50,12 +136,16 @@ fix_mount() {
sync
fi

echo 'INFO: remounting /sys read-only'
# systemd-in-a-container should have read only /sys
# https://systemd.io/CONTAINER_INTERFACE/
# however, we need other things from `docker run --privileged` ...
# and this flag also happens to make /sys rw, amongst other things
mount -o remount,ro /sys
if [[ -z "${userns}" ]]; then
echo 'INFO: remounting /sys read-only'
# systemd-in-a-container should have read only /sys
# https://systemd.io/CONTAINER_INTERFACE/
# however, we need other things from `docker run --privileged` ...
# and this flag also happens to make /sys rw, amongst other things
#
# This step is skipped when running inside UserNS, because it fails with EACCES.
mount -o remount,ro /sys
fi

echo 'INFO: making mounts shared' >&2
# for mount propagation
Expand Down Expand Up @@ -212,6 +302,13 @@ fix_kmsg() {
else
echo 'WARN: /dev/kmsg does not exist, nor does /dev/console!' >&2
fi
elif [[ -n "${userns}" ]]; then
if [[ -f "/proc/sys/kernel/dmesg_restrict" ]]; then
if [[ "$(cat /proc/sys/kernel/dmesg_restrict)" = "1" ]]; then
echo 'WARN: UserNS: /dev/kmsg is not readable, faking with /dev/null (hint: set sysctl value "kernel.dmesg_restrict" to 0)' >&2
mount --bind /dev/null /dev/kmsg
fi
fi
fi
}

Expand Down Expand Up @@ -299,6 +396,9 @@ enable_network_magic(){
fi
}

# validate state
validate_userns

# run pre-init fixups
# NOTE: it's important that we do configure* first in this order to avoid races
configure_containerd
Expand Down
6 changes: 6 additions & 0 deletions pkg/cluster/internal/create/actions/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ func (a *Action) Execute(ctx *actions.ActionContext) error {
ctx.Status.Start("Writing configuration 📜")
defer ctx.Status.End(false)

providerInfo, err := ctx.Provider.Info()
if err != nil {
return err
}

allNodes, err := ctx.Nodes()
if err != nil {
return err
Expand Down Expand Up @@ -76,6 +81,7 @@ func (a *Action) Execute(ctx *actions.ActionContext) error {
IPv6: ctx.Config.Networking.IPFamily == "ipv6",
FeatureGates: ctx.Config.FeatureGates,
RuntimeConfig: ctx.Config.RuntimeConfig,
RootlessProvider: providerInfo.Rootless,
}

kubeadmConfigPlusPatches := func(node nodes.Node, data kubeadm.ConfigData) func() error {
Expand Down
16 changes: 15 additions & 1 deletion pkg/cluster/internal/kubeadm/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ type ConfigData struct {
// These auto-generated fields are available to Config templates,
// but not meant to be set by hand
DerivedConfigData

// Provider is running with rootless mode, so kube-proxy needs to be configured
// not to fail on sysctl error.
RootlessProvider bool
}

// DerivedConfigData fields are automatically derived by
Expand Down Expand Up @@ -385,7 +389,14 @@ mode: "{{ .KubeProxyMode }}"
{{end}}{{end}}
iptables:
minSyncPeriod: 1s
{{end}}
{{if .RootlessProvider}}conntrack:
# Skip setting sysctl value "net.netfilter.nf_conntrack_max"
maxPerCore: 0
# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_established"
tcpEstablishedTimeout: 0s
# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_close"
tcpCloseWaitTimeout: 0s
{{end}}{{end}}
`

// Config returns a kubeadm config generated from config data, in particular
Expand All @@ -404,6 +415,9 @@ func Config(data ConfigData) (config string, err error) {
// assume the latest API version, then fallback if the k8s version is too low
templateSource := ConfigTemplateBetaV2
if ver.LessThan(version.MustParseSemantic("v1.15.0")) {
if data.RootlessProvider {
return "", errors.Errorf("version %q is not compatible with rootless provider", ver)
}
templateSource = ConfigTemplateBetaV1
}

Expand Down
32 changes: 32 additions & 0 deletions pkg/cluster/internal/providers/docker/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package docker

import (
"encoding/csv"
"encoding/json"
"fmt"
"net"
"os"
Expand Down Expand Up @@ -281,3 +283,33 @@ func (p *provider) CollectLogs(dir string, nodes []nodes.Node) error {
errs = append(errs, errors.AggregateConcurrent(fns))
return errors.NewAggregate(errs)
}

// Info returns the provider info.
//
// It asks the docker CLI for the daemon's security options and reports the
// provider as rootless when one of them is "name=rootless". An error is
// returned if `docker info` fails or its output cannot be parsed.
func (p *provider) Info() (*providers.ProviderInfo, error) {
	cmd := exec.Command("docker", "info", "--format", "{{json .SecurityOptions}}")
	out, err := exec.Output(cmd)
	if err != nil {
		return nil, errors.Wrap(err, "failed to get docker info")
	}
	var securityOptions []string
	if err := json.Unmarshal(out, &securityOptions); err != nil {
		// add context so a malformed response is distinguishable from other failures
		return nil, errors.Wrap(err, "failed to parse docker security options")
	}
	var info providers.ProviderInfo
	for _, o := range securityOptions {
		// o is like "name=seccomp,profile=default", or "name=rootless",
		// i.e. a comma separated list of key=value fields
		csvReader := csv.NewReader(strings.NewReader(o))
		records, err := csvReader.ReadAll()
		if err != nil {
			return nil, errors.Wrap(err, "failed to parse docker security option")
		}
		for _, fields := range records {
			for _, field := range fields {
				if field == "name=rootless" {
					info.Rootless = true
				}
			}
		}
	}
	return &info, nil
}
14 changes: 8 additions & 6 deletions pkg/cluster/internal/providers/podman/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,6 @@ func (p *provider) Provision(status *cli.Status, cfg *config.Cluster) (err error
return err
}

// kind doesn't work with podman rootless, surface an error
if os.Geteuid() != 0 {
p.logger.Errorf("podman provider does not work properly in rootless mode")
os.Exit(1)
}

// TODO: validate cfg
// ensure node images are pulled before actually provisioning
if err := ensureNodeImages(p.logger, status, cfg); err != nil {
Expand Down Expand Up @@ -350,3 +344,11 @@ func (p *provider) CollectLogs(dir string, nodes []nodes.Node) error {
errs = append(errs, errors.AggregateConcurrent(fns))
return errors.NewAggregate(errs)
}

// Info returns the provider info.
//
// The provider is reported as rootless when the effective user ID of the
// current process is not 0.
func (p *provider) Info() (*providers.ProviderInfo, error) {
	rootless := os.Geteuid() != 0
	return &providers.ProviderInfo{Rootless: rootless}, nil
}
7 changes: 7 additions & 0 deletions pkg/cluster/internal/providers/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,11 @@ type Provider interface {
GetAPIServerInternalEndpoint(cluster string) (string, error)
// CollectLogs will populate dir with cluster logs and other debug files
CollectLogs(dir string, nodes []nodes.Node) error
// Info returns the provider info
Info() (*ProviderInfo, error)
}

// ProviderInfo is the info of the provider
type ProviderInfo struct {
// Rootless is true when the provider is running in rootless mode.
Rootless bool
}
59 changes: 59 additions & 0 deletions site/content/docs/user/rootless.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
---
title: "Running kind with Rootless Docker"
menu:
main:
parent: "user"
identifier: "rootless"
weight: 3
---
Starting with kind 0.11.0, [Rootless Docker](https://docs.docker.com/go/rootless/) and [Rootless Podman](https://github.com/containers/podman/blob/master/docs/tutorials/rootless_tutorial.md) can be used as the node provider of kind.

## Provider requirements
- Docker: 20.10 or later
- Podman: 3.0 or later

## Host requirements
The host needs to be running with cgroup v2.

cgroup v2 is enabled by default on Fedora.
On other distros, cgroup v2 can typically be enabled by adding `GRUB_CMDLINE_LINUX="systemd.unified_cgroup_hierarchy=1"` to `/etc/default/grub` and
running `sudo update-grub`.

Also, depending on the host configuration, the following steps might be needed:

- Create `/etc/systemd/system/user@.service.d/delegate.conf` with the following content, and then run `sudo systemctl daemon-reload`:
```ini
[Service]
Delegate=yes
```

- Create `/etc/modules-load.d/iptables.conf` with the following content:
```
iptables_nat
ip6tables_nat
```

## Restrictions

The restrictions of Rootless Docker apply to kind clusters as well.

e.g.
- OverlayFS cannot be used unless the host is using kernel >= 5.11, or an Ubuntu/Debian kernel
- Cannot mount block storage
- Cannot mount NFS

## Creating a kind cluster with Rootless Docker

To create a kind cluster with Rootless Docker, just run:
```console
$ export DOCKER_HOST=unix://${XDG_RUNTIME_DIR}/docker.sock
$ kind create cluster
```

To create a kind cluster with Rootless Podman, just run:
```console
$ KIND_EXPERIMENTAL_PROVIDER=podman kind create cluster
```

## Tips
- To enable OOM watching, allow `dmesg` by running `sysctl -w kernel.dmesg_restrict=0`.

0 comments on commit 85d51d8

Please sign in to comment.