From 323225483ed87e20a0d2c32b4e02f49afc5a39a3 Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Thu, 26 Aug 2021 15:01:07 +0900 Subject: [PATCH] Support Rootless Docker Requirements: - Install rootless Docker 20.10 or later, see https://rootlesscontaine.rs/getting-started/docker/ - Enable cgroup v2 delegation, see https://rootlesscontaine.rs/getting-started/common/cgroup2/ Usage: `minikube start --driver=docker --container-runtime=containerd`. The `--container-runtime` flag needs to be set to "containerd". CRI-O can also be supported later. Closes issue 10836 ("add support for rootless Docker"). Support for rootless Podman (issue 8719) is not covered in this commit. --- Code reading guide: - `deploy/kicbase/Dockerfile`: updated to install fuse-overlayfs and containerd-fuse-overlayfs, which is used instead of the `overlayfs` snapshotter - `deploy/kicbase/entrypoint`: updated to verify cgroup v2 delegation. Mostly from https://github.com/kubernetes-sigs/kind/blob/8a83ee46b28a80ccd47a85e24294b3e149361947/images/base/files/usr/local/bin/entrypoint - `cmd/minikube/cmd/start_flags.go`: updated to set the `KubeletInUserNamespace` feature gate when rootless - `pkg/drivers/kic/oci`: updated to use port forwarding, because rootless container IPs are not reachable from the host - `pkg/minikube/cruntime`: updated to generate `/etc/containerd/config.toml` with rootless support. 
Signed-off-by: Akihiro Suda --- cmd/minikube/cmd/start_flags.go | 28 ++++++++++ deploy/kicbase/Dockerfile | 19 ++++++- .../kicbase/containerd-fuse-overlayfs.service | 13 +++++ deploy/kicbase/entrypoint | 52 ++++++++++++++++--- hack/preload-images/generate.go | 2 +- pkg/drivers/kic/oci/network.go | 27 ++++++++++ pkg/drivers/kic/oci/oci.go | 3 ++ pkg/minikube/cruntime/containerd.go | 29 +++++++++-- pkg/minikube/cruntime/crio.go | 5 +- pkg/minikube/cruntime/cruntime.go | 2 +- pkg/minikube/cruntime/cruntime_test.go | 2 +- pkg/minikube/cruntime/docker.go | 5 +- pkg/minikube/driver/driver.go | 10 +++- pkg/minikube/node/start.go | 3 +- pkg/minikube/registry/drvs/docker/docker.go | 9 +--- site/content/en/docs/drivers/docker.md | 1 - .../en/docs/drivers/includes/docker_usage.inc | 21 ++++++++ 17 files changed, 204 insertions(+), 27 deletions(-) create mode 100644 deploy/kicbase/containerd-fuse-overlayfs.service diff --git a/cmd/minikube/cmd/start_flags.go b/cmd/minikube/cmd/start_flags.go index f87fd108e654..d042258fd8b3 100644 --- a/cmd/minikube/cmd/start_flags.go +++ b/cmd/minikube/cmd/start_flags.go @@ -492,9 +492,37 @@ func generateNewConfigFromFlags(cmd *cobra.Command, k8sVersion string, drvName s } } + if driver.IsKIC(drvName) { + si, err := oci.CachedDaemonInfo(drvName) + if err != nil { + exit.Message(reason.Usage, "Ensure your {{.driver_name}} is running and is healthy.", out.V{"driver_name": driver.FullName(drvName)}) + } + if si.Rootless { + if cc.KubernetesConfig.ContainerRuntime != "containerd" { + exit.Message(reason.Usage, "Container runtime must be set to \"containerd\" for rootless") + // TODO: support cri-o (https://kubernetes.io/docs/tasks/administer-cluster/kubelet-in-userns/#configuring-cri) + } + // KubeletInUserNamespace feature gate is essential for rootless driver. 
+ // See https://kubernetes.io/docs/tasks/administer-cluster/kubelet-in-userns/ + cc.KubernetesConfig.FeatureGates = addFeatureGate(cc.KubernetesConfig.FeatureGates, "KubeletInUserNamespace=true") + } + } + return cc } +func addFeatureGate(featureGates, s string) string { + split := strings.Split(featureGates, ",") + m := make(map[string]struct{}, len(split)) + for _, v := range split { + m[v] = struct{}{} + } + if _, ok := m[s]; !ok { + split = append(split, s) + } + return strings.Join(split, ",") +} + func checkNumaCount(k8sVersion string) { if viper.GetInt(kvmNUMACount) < 1 || viper.GetInt(kvmNUMACount) > 8 { exit.Message(reason.Usage, "--kvm-numa-count range is 1-8") diff --git a/deploy/kicbase/Dockerfile b/deploy/kicbase/Dockerfile index 95a62efccfe0..d76aeecef4d4 100644 --- a/deploy/kicbase/Dockerfile +++ b/deploy/kicbase/Dockerfile @@ -30,6 +30,8 @@ RUN cd ./cmd/auto-pause/ && go build FROM ubuntu:focal-20210401 ARG BUILDKIT_VERSION="v0.9.0" +ARG FUSE_OVERLAYFS_VERSION="v1.7.1" +ARG CONTAINERD_FUSE_OVERLAYFS_VERSION="1.0.3" # copy in static files (configs, scripts) COPY deploy/kicbase/10-network-security.conf /etc/sysctl.d/10-network-security.conf @@ -113,7 +115,9 @@ RUN clean-install \ openssh-server \ dnsutils \ # libglib2.0-0 is required for conmon, which is required for podman - libglib2.0-0 + libglib2.0-0 \ + # fuse3 is required for fuse-overlayfs + fuse3 # install docker RUN sh -c "echo 'deb https://download.docker.com/linux/ubuntu focal stable' > /etc/apt/sources.list.d/docker.list" && \ @@ -121,6 +125,19 @@ RUN sh -c "echo 'deb https://download.docker.com/linux/ubuntu focal stable' > /e apt-key add - < docker.key && \ clean-install docker-ce docker-ce-cli containerd.io +# install fuse-overlayfs (used by rootless; apt-get version is old) +RUN curl -sSL --retry 5 --output /usr/local/bin/fuse-overlayfs https://github.com/containers/fuse-overlayfs/releases/download/${FUSE_OVERLAYFS_VERSION}/fuse-overlayfs-$(uname -m) \ + && chmod +x 
/usr/local/bin/fuse-overlayfs + +# install containerd-fuse-overlayfs (used by rootless) +RUN export ARCH=$(dpkg --print-architecture | sed 's/ppc64el/ppc64le/' | sed 's/armhf/arm-v7/') \ + && echo "Installing containerd-fuse-overlayfs..." \ + && export CONTAINERD_FUSE_OVERLAYFS_BASE_URL="https://github.com/containerd/fuse-overlayfs-snapshotter/releases/download/v${CONTAINERD_FUSE_OVERLAYFS_VERSION}" \ + && curl -sSL --retry 5 --output /tmp/containerd-fuse-overlayfs.tgz "${CONTAINERD_FUSE_OVERLAYFS_BASE_URL}/containerd-fuse-overlayfs-${CONTAINERD_FUSE_OVERLAYFS_VERSION}-linux-${ARCH}.tar.gz" \ + && tar -C /usr/local/bin -xzvf /tmp/containerd-fuse-overlayfs.tgz \ + && rm -rf /tmp/containerd-fuse-overlayfs.tgz +COPY deploy/kicbase/containerd-fuse-overlayfs.service /etc/systemd/system/containerd-fuse-overlayfs.service + # install buildkit RUN export ARCH=$(dpkg --print-architecture | sed 's/ppc64el/ppc64le/' | sed 's/armhf/arm-v7/') \ && echo "Installing buildkit ..." \ diff --git a/deploy/kicbase/containerd-fuse-overlayfs.service b/deploy/kicbase/containerd-fuse-overlayfs.service new file mode 100644 index 000000000000..a3d12bd5677a --- /dev/null +++ b/deploy/kicbase/containerd-fuse-overlayfs.service @@ -0,0 +1,13 @@ +# From https://github.com/kubernetes-sigs/kind/blob/0d3780371091b2dc9ff6eea1b6054f14ff5d970a/images/base/files/etc/systemd/system/containerd-fuse-overlayfs.service +[Unit] +Description=containerd fuse-overlayfs snapshotter +PartOf=containerd.service + +[Service] +ExecStart=/usr/local/bin/containerd-fuse-overlayfs-grpc /run/containerd-fuse-overlayfs.sock /var/lib/containerd-fuse-overlayfs +Type=notify +Restart=always +RestartSec=1 + +[Install] +WantedBy=multi-user.target diff --git a/deploy/kicbase/entrypoint b/deploy/kicbase/entrypoint index cc6a9338a054..0a0a26641c3a 100755 --- a/deploy/kicbase/entrypoint +++ b/deploy/kicbase/entrypoint @@ -19,6 +19,39 @@ set -o nounset set -o pipefail set -x +# If /proc/self/uid_map maps the full range of 4294967295 IDs, we are in the 
initial user namespace, i.e. the host. +# Otherwise we are in a non-initial user namespace. +# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118 +userns="" +if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then + userns="1" + echo 'INFO: running in a user namespace (experimental)' +fi + +validate_userns() { + if [[ -z "${userns}" ]]; then + return + fi + + local nofile_hard + nofile_hard="$(ulimit -Hn)" + local nofile_hard_expected="64000" + if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then + echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2 + fi + + if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then + echo "ERROR: UserNS: cgroup v2 needs to be enabled, see https://rootlesscontaine.rs/getting-started/common/cgroup2/" >&2 + exit 1 + fi + for f in cpu memory pids; do + if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then + echo "ERROR: UserNS: $f controller needs to be delegated, see https://rootlesscontaine.rs/getting-started/common/cgroup2/" >&2 + exit 1 + fi + done +} + configure_containerd() { # we need to switch to the 'native' snapshotter on zfs if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then @@ -73,12 +106,16 @@ fix_mount() { sync fi - echo 'INFO: remounting /sys read-only' - # systemd-in-a-container should have read only /sys - # https://systemd.io/CONTAINER_INTERFACE/ - # however, we need other things from `docker run --privileged` ... - # and this flag also happens to make /sys rw, amongst other things - mount -o remount,ro /sys + if [[ -z "${userns}" ]]; then + echo 'INFO: remounting /sys read-only' + # systemd-in-a-container should have read only /sys + # https://systemd.io/CONTAINER_INTERFACE/ + # however, we need other things from `docker run --privileged` ... 
+ # and this flag also happens to make /sys rw, amongst other things + # + # This step is skipped when running inside UserNS, because it fails with EACCES. + mount -o remount,ro /sys + fi echo 'INFO: making mounts shared' >&2 # for mount propagation @@ -334,6 +371,9 @@ enable_network_magic(){ fi } +# validate state +validate_userns + # run pre-init fixups # NOTE: it's important that we do configure* first in this order to avoid races configure_containerd diff --git a/hack/preload-images/generate.go b/hack/preload-images/generate.go index 88ac9024635a..25e98d2b7794 100644 --- a/hack/preload-images/generate.go +++ b/hack/preload-images/generate.go @@ -93,7 +93,7 @@ func generateTarball(kubernetesVersion, containerRuntime, tarballFilename string if err != nil { return errors.Wrap(err, "failed create new runtime") } - if err := cr.Enable(true, false); err != nil { + if err := cr.Enable(true, false, false); err != nil { return errors.Wrap(err, "enable container runtime") } diff --git a/pkg/drivers/kic/oci/network.go b/pkg/drivers/kic/oci/network.go index b8c34533404b..a1cff1762f25 100644 --- a/pkg/drivers/kic/oci/network.go +++ b/pkg/drivers/kic/oci/network.go @@ -34,6 +34,33 @@ import ( // RoutableHostIPFromInside returns the ip/dns of the host that container lives on // is routable from inside the container func RoutableHostIPFromInside(ociBin string, clusterName string, containerName string) (net.IP, error) { + si, err := CachedDaemonInfo(ociBin) + if err != nil { + return nil, err + } + if si.Rootless { + if IsExternalDaemonHost(ociBin) { + return nil, fmt.Errorf("function RoutableHostIPFromInside is not implemented for external rootless daemons") + // TODO: parse DaemonHost() + } + addrs, err := net.InterfaceAddrs() + if err != nil { + return nil, err + } + for _, addr := range addrs { + var ip net.IP + switch v := addr.(type) { + case *net.IPAddr: + ip = v.IP + case *net.IPNet: + ip = v.IP + } + if ip != nil && !ip.IsLoopback() { + return ip, nil + } + } + return 
nil, fmt.Errorf("could not detect host IP, tried %v", addrs) + } if ociBin == Docker { if runtime.GOOS == "linux" { info, err := containerNetworkInspect(ociBin, clusterName) diff --git a/pkg/drivers/kic/oci/oci.go b/pkg/drivers/kic/oci/oci.go index 258c62c860fb..cacb16a9cc82 100644 --- a/pkg/drivers/kic/oci/oci.go +++ b/pkg/drivers/kic/oci/oci.go @@ -162,6 +162,9 @@ func CreateContainerNode(p CreateParams) error { // including some ones docker would otherwise do by default. // for now this is what we want. in the future we may revisit this. "--privileged", + // enable /dev/fuse explicitly for fuse-overlayfs + // (Rootless Docker does not automatically mount /dev/fuse with --privileged) + "--device", "/dev/fuse", "--security-opt", "seccomp=unconfined", // ignore seccomp "--tmpfs", "/tmp", // various things depend on working /tmp "--tmpfs", "/run", // systemd wants a writable /run diff --git a/pkg/minikube/cruntime/containerd.go b/pkg/minikube/cruntime/containerd.go index aba79f668f8e..ca50cd9892ef 100644 --- a/pkg/minikube/cruntime/containerd.go +++ b/pkg/minikube/cruntime/containerd.go @@ -69,6 +69,12 @@ oom_score = 0 [cgroup] path = "" +[proxy_plugins] +# fuse-overlayfs is used for rootless +[proxy_plugins."fuse-overlayfs"] + type = "snapshot" + address = "/run/containerd-fuse-overlayfs.sock" + [plugins] [plugins.cgroups] no_prometheus = false @@ -80,6 +86,7 @@ oom_score = 0 stats_collect_period = 10 enable_tls_streaming = false max_container_log_line_size = 16384 + restrict_oom_score_adj = {{ .RestrictOOMScoreAdj }} [plugins."io.containerd.grpc.v1.cri"] [plugins."io.containerd.grpc.v1.cri".containerd] @@ -90,7 +97,7 @@ oom_score = 0 SystemdCgroup = {{ .SystemdCgroup }} [plugins.cri.containerd] - snapshotter = "overlayfs" + snapshotter = "{{ .Snapshotter }}" [plugins.cri.containerd.default_runtime] runtime_type = "io.containerd.runc.v2" [plugins.cri.containerd.untrusted_workload_runtime] @@ -193,23 +200,31 @@ func (r *Containerd) Available() error { } // 
generateContainerdConfig sets up /etc/containerd/config.toml -func generateContainerdConfig(cr CommandRunner, imageRepository string, kv semver.Version, forceSystemd bool, insecureRegistry []string) error { +func generateContainerdConfig(cr CommandRunner, imageRepository string, kv semver.Version, forceSystemd bool, insecureRegistry []string, inUserNamespace bool) error { cPath := containerdConfigFile t, err := template.New("containerd.config.toml").Parse(containerdConfigTemplate) if err != nil { return err } pauseImage := images.Pause(kv, imageRepository) + snapshotter := "overlayfs" + if inUserNamespace { + snapshotter = "fuse-overlayfs" + } opts := struct { PodInfraContainerImage string SystemdCgroup bool InsecureRegistry []string CNIConfDir string + RestrictOOMScoreAdj bool + Snapshotter string }{ PodInfraContainerImage: pauseImage, SystemdCgroup: forceSystemd, InsecureRegistry: insecureRegistry, CNIConfDir: cni.ConfDir, + RestrictOOMScoreAdj: inUserNamespace, + Snapshotter: snapshotter, } var b bytes.Buffer if err := t.Execute(&b, opts); err != nil { @@ -223,7 +238,7 @@ func generateContainerdConfig(cr CommandRunner, imageRepository string, kv semve } // Enable idempotently enables containerd on a host -func (r *Containerd) Enable(disOthers, forceSystemd bool) error { +func (r *Containerd) Enable(disOthers, forceSystemd, inUserNamespace bool) error { if disOthers { if err := disableOthers(r, r.Runner); err != nil { klog.Warningf("disableOthers: %v", err) @@ -232,13 +247,19 @@ func (r *Containerd) Enable(disOthers, forceSystemd bool) error { if err := populateCRIConfig(r.Runner, r.SocketPath()); err != nil { return err } - if err := generateContainerdConfig(r.Runner, r.ImageRepository, r.KubernetesVersion, forceSystemd, r.InsecureRegistry); err != nil { + if err := generateContainerdConfig(r.Runner, r.ImageRepository, r.KubernetesVersion, forceSystemd, r.InsecureRegistry, inUserNamespace); err != nil { return err } if err := enableIPForwarding(r.Runner); err != 
nil { return err } + if inUserNamespace { + if err := r.Init.EnableNow("containerd-fuse-overlayfs"); err != nil { + return err + } + } + // Otherwise, containerd will fail API requests with 'Unimplemented' return r.Init.Restart("containerd") } diff --git a/pkg/minikube/cruntime/crio.go b/pkg/minikube/cruntime/crio.go index 1bca1c5eeb95..6ea8a8413f94 100644 --- a/pkg/minikube/cruntime/crio.go +++ b/pkg/minikube/cruntime/crio.go @@ -139,7 +139,10 @@ func enableIPForwarding(cr CommandRunner) error { } // Enable idempotently enables CRIO on a host -func (r *CRIO) Enable(disOthers, _ bool) error { +func (r *CRIO) Enable(disOthers, _, inUserNamespace bool) error { + if inUserNamespace { + return errors.New("inUserNamespace must not be true for cri-o (yet)") + } if disOthers { if err := disableOthers(r, r.Runner); err != nil { klog.Warningf("disableOthers: %v", err) diff --git a/pkg/minikube/cruntime/cruntime.go b/pkg/minikube/cruntime/cruntime.go index d736d809ce6b..4d42c65f8579 100644 --- a/pkg/minikube/cruntime/cruntime.go +++ b/pkg/minikube/cruntime/cruntime.go @@ -78,7 +78,7 @@ type Manager interface { // Version retrieves the current version of this runtime Version() (string, error) // Enable idempotently enables this runtime on a host - Enable(bool, bool) error + Enable(bool, bool, bool) error // Disable idempotently disables this runtime on a host Disable() error // Active returns whether or not a runtime is active on a host diff --git a/pkg/minikube/cruntime/cruntime_test.go b/pkg/minikube/cruntime/cruntime_test.go index 3a55059cc4fe..b87212b4a738 100644 --- a/pkg/minikube/cruntime/cruntime_test.go +++ b/pkg/minikube/cruntime/cruntime_test.go @@ -668,7 +668,7 @@ func TestEnable(t *testing.T) { if err != nil { t.Fatalf("New(%s): %v", tc.runtime, err) } - err = cr.Enable(true, false) + err = cr.Enable(true, false, false) if err != nil { t.Errorf("%s disable unexpected error: %v", tc.runtime, err) } diff --git a/pkg/minikube/cruntime/docker.go 
b/pkg/minikube/cruntime/docker.go index d3de483deee6..30772658122a 100644 --- a/pkg/minikube/cruntime/docker.go +++ b/pkg/minikube/cruntime/docker.go @@ -107,7 +107,10 @@ func (r *Docker) Active() bool { } // Enable idempotently enables Docker on a host -func (r *Docker) Enable(disOthers, forceSystemd bool) error { +func (r *Docker) Enable(disOthers, forceSystemd, inUserNamespace bool) error { + if inUserNamespace { + return errors.New("inUserNamespace must not be true for docker") + } containerdWasActive := r.Init.Active("containerd") if disOthers { diff --git a/pkg/minikube/driver/driver.go b/pkg/minikube/driver/driver.go index 6481ff632f78..3a76001a00a6 100644 --- a/pkg/minikube/driver/driver.go +++ b/pkg/minikube/driver/driver.go @@ -185,7 +185,15 @@ func NeedsPortForward(name string) bool { return true } // Docker for Desktop - return runtime.GOOS == "darwin" || runtime.GOOS == "windows" || detect.IsMicrosoftWSL() + if runtime.GOOS == "darwin" || runtime.GOOS == "windows" || detect.IsMicrosoftWSL() { + return true + } + + si, err := oci.CachedDaemonInfo(name) + if err != nil { + panic(err) + } + return si.Rootless } // HasResourceLimits returns true if driver can set resource limits such as memory size or CPU count. 
diff --git a/pkg/minikube/node/start.go b/pkg/minikube/node/start.go index 1361c67ebb06..2d45fabbaf20 100644 --- a/pkg/minikube/node/start.go +++ b/pkg/minikube/node/start.go @@ -349,7 +349,8 @@ func configureRuntimes(runner cruntime.CommandRunner, cc config.ClusterConfig, k } } - err = cr.Enable(disableOthers, forceSystemd()) + inUserNamespace := strings.Contains(cc.KubernetesConfig.FeatureGates, "KubeletInUserNamespace=true") + err = cr.Enable(disableOthers, forceSystemd(), inUserNamespace) if err != nil { exit.Error(reason.RuntimeEnable, "Failed to enable container runtime", err) } diff --git a/pkg/minikube/registry/drvs/docker/docker.go b/pkg/minikube/registry/drvs/docker/docker.go index a7aaea2ac89f..cc6bb558b8e4 100644 --- a/pkg/minikube/registry/drvs/docker/docker.go +++ b/pkg/minikube/registry/drvs/docker/docker.go @@ -146,14 +146,7 @@ func status() (retState registry.State) { return suggestFix("info", -1, serr, fmt.Errorf("docker info error: %s", serr)) } - if si.Rootless { - return registry.State{ - Reason: "PROVIDER_DOCKER_ROOTLESS", - Error: errors.New("rootless Docker not supported yet"), - Installed: true, - Healthy: false, - Doc: "https://github.com/kubernetes/minikube/issues/10836"} - } + // TODO: validate cgroup v2 delegation when si.Rootless is true return checkNeedsImprovement() } diff --git a/site/content/en/docs/drivers/docker.md b/site/content/en/docs/drivers/docker.md index 1300cd086715..4dad78879b96 100644 --- a/site/content/en/docs/drivers/docker.md +++ b/site/content/en/docs/drivers/docker.md @@ -21,7 +21,6 @@ The Docker driver allows you to install Kubernetes into an existing Docker insta - The following Docker runtime security options are currently *unsupported and will not work* with the Docker driver (see [#9607](https://github.com/kubernetes/minikube/issues/9607)): - [userns-remap](https://docs.docker.com/engine/security/userns-remap/) - - [rootless](https://docs.docker.com/engine/security/rootless/) - On macOS, containers might get 
hung and require a restart of Docker for Desktop. See [docker/for-mac#1835](https://github.com/docker/for-mac/issues/1835) diff --git a/site/content/en/docs/drivers/includes/docker_usage.inc b/site/content/en/docs/drivers/includes/docker_usage.inc index 8b1fd6c694ad..63d50ca323cc 100644 --- a/site/content/en/docs/drivers/includes/docker_usage.inc +++ b/site/content/en/docs/drivers/includes/docker_usage.inc @@ -16,3 +16,24 @@ To make docker the default driver: minikube config set driver docker ``` +## Rootless Docker +### Requirements +- Docker 20.10 or higher, see https://rootlesscontaine.rs/getting-started/docker/ +- Cgroup v2 delegation, see https://rootlesscontaine.rs/getting-started/common/cgroup2/ + +### Usage + +Start a cluster using the rootless docker driver: + +```shell +dockerd-rootless-setuptool.sh install -f +docker context use rootless + +minikube start --driver=docker --container-runtime=containerd +``` + +The `--container-runtime` flag must currently be set to "containerd". + +The restrictions of rootless `kind` apply to minikube with rootless docker as well. + +See https://kind.sigs.k8s.io/docs/user/rootless/ .