From 73393fc6725fc7dd8928b0bbf62cc0aed9203343 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Thu, 7 Jan 2021 20:03:50 +0100 Subject: [PATCH] Implement the rootless-cni-infra container imageless As proposed by Akihiro Suda make the rootless-cni-infra container use the host rootfs instead of an image. This works by mounting the host rootfs in the user namespace to `$runroot/rootless-cni-infra` and using this as rootfs for the container. Second, rewrite the rootless-cni-infra shell script in Go to remove the extra cnitool dependency which is not packaged anywhere. With that we only need the same dependencies as rootful podman which should be already installed. Advantages: - Works for all architectures podman supports. - Works without internet connection. - No extra maintenance of an extra image. Disadvantages: - Requires the dependencies to be available on the host (e.g. dnsname plugin). The user may not have control over those. Problems: - It doesn't unmount the rootfs if the rootless-cni-infra container is stopped directly. Also the image version did not respect the `--cni-config-dir` option properly. It mounted the cni config dir only at container create time but this option can be used on podman run commands which did not work if the rootless-cni-infra container was already running. This is only possible with the rootfs version. Live upgrading is possible. If the old infra container is still running podman talks via the old api to the script. Once the old infra container is deleted the new imageless infra container will be created and podman can talk via the new api. A version label is added to the container to distinguish between old and new. 
Signed-off-by: Paul Holzinger --- contrib/rootless-cni-infra/Containerfile | 36 -- contrib/rootless-cni-infra/README.md | 9 +- contrib/rootless-cni-infra/rootless-cni-infra | 181 -------- libpod/network/create.go | 6 +- libpod/rootless_cni_linux.go | 344 ++++++++++----- pkg/rootless/cni/rootless_cni.go | 391 ++++++++++++++++++ test/system/500-networking.bats | 7 - 7 files changed, 627 insertions(+), 347 deletions(-) delete mode 100644 contrib/rootless-cni-infra/Containerfile delete mode 100755 contrib/rootless-cni-infra/rootless-cni-infra create mode 100644 pkg/rootless/cni/rootless_cni.go diff --git a/contrib/rootless-cni-infra/Containerfile b/contrib/rootless-cni-infra/Containerfile deleted file mode 100644 index 4324f39d2b..0000000000 --- a/contrib/rootless-cni-infra/Containerfile +++ /dev/null @@ -1,36 +0,0 @@ -ARG GOLANG_VERSION=1.15 -ARG ALPINE_VERSION=3.12 -ARG CNI_VERSION=v0.8.0 -ARG CNI_PLUGINS_VERSION=v0.8.7 -ARG DNSNAME_VERSION=v1.1.1 - -FROM golang:${GOLANG_VERSION}-alpine${ALPINE_VERSION} AS golang-base -RUN apk add --no-cache git - -FROM golang-base AS cnitool -RUN git clone https://github.com/containernetworking/cni /go/src/github.com/containernetworking/cni -WORKDIR /go/src/github.com/containernetworking/cni -ARG CNI_VERSION -RUN git checkout ${CNI_VERSION} -RUN go build -o /cnitool ./cnitool - -FROM golang-base AS dnsname -RUN git clone https://github.com/containers/dnsname /go/src/github.com/containers/dnsname -WORKDIR /go/src/github.com/containers/dnsname -ARG DNSNAME_VERSION -RUN git checkout ${DNSNAME_VERSION} -RUN go build -o /dnsname ./plugins/meta/dnsname - -FROM alpine:${ALPINE_VERSION} -RUN apk add --no-cache curl dnsmasq iptables ip6tables iproute2 -ARG TARGETARCH -ARG CNI_PLUGINS_VERSION -RUN mkdir -p /opt/cni/bin && \ - curl -fsSL https://github.com/containernetworking/plugins/releases/download/${CNI_PLUGINS_VERSION}/cni-plugins-linux-${TARGETARCH}-${CNI_PLUGINS_VERSION}.tgz | tar xz -C /opt/cni/bin -COPY --from=cnitool /cnitool 
/usr/local/bin -COPY --from=dnsname /dnsname /opt/cni/bin -COPY rootless-cni-infra /usr/local/bin -ENV CNI_PATH=/opt/cni/bin -CMD ["sleep", "infinity"] - -ENV ROOTLESS_CNI_INFRA_VERSION=5 diff --git a/contrib/rootless-cni-infra/README.md b/contrib/rootless-cni-infra/README.md index c43b4cf491..dc21791a7e 100644 --- a/contrib/rootless-cni-infra/README.md +++ b/contrib/rootless-cni-infra/README.md @@ -7,19 +7,16 @@ Infra container for CNI-in-slirp4netns. When a CNI network is specified for `podman run` in rootless mode, Podman launches the `rootless-cni-infra` container to execute CNI plugins inside slirp4netns. The infra container is created per user, by executing an equivalent of: -`podman run -d --name rootless-cni-infra --pid=host --privileged -v $HOME/.config/cni/net.d:/etc/cni/net.d rootless-cni-infra`. +`podman run -d --name rootless-cni-infra --pid=host --privileged --rootfs /`. The infra container is automatically deleted when no CNI network is in use. Podman then allocates a CNI netns in the infra container, by executing an equivalent of: -`podman exec rootless-cni-infra rootless-cni-infra alloc $CONTAINER_ID $NETWORK_NAME $POD_NAME`. +`podman exec rootless-cni-infra rootless-cni-infra alloc $CONTAINER_ID $NETWORK_NAME`. The allocated netns is deallocated when the container is being removed, by executing an equivalent of: `podman exec rootless-cni-infra rootless-cni-infra dealloc $CONTAINER_ID $NETWORK_NAME`. -The container images live on `quay.io/libpod/rootless-cni-infra`. The tags have the format `$version-$architecture`. Please make sure to increase the version number in the Containerfile (i.e., `ROOTLESS_CNI_INFRA_VERSION`) when applying changes to this directory. After committing the changes, upload the image(s) with the corresponding tag. 
- ## Directory layout -* `/run/rootless-cni-infra/${CONTAINER_ID}/pid`: PID of the `sleep infinity` process that corresponds to the allocated netns -* `/run/rootless-cni-infra/${CONTAINER_ID}/attached/${NETWORK_NAME}`: CNI result +* `/run/rootless-cni-infra/${CONTAINER_ID}/pid`: PID of the `sleep` process that corresponds to the allocated netns * `/run/rootless-cni-infra/${CONTAINER_ID}/attached-args/${NETWORK_NAME}`: CNI args diff --git a/contrib/rootless-cni-infra/rootless-cni-infra b/contrib/rootless-cni-infra/rootless-cni-infra deleted file mode 100755 index cceb8d817a..0000000000 --- a/contrib/rootless-cni-infra/rootless-cni-infra +++ /dev/null @@ -1,181 +0,0 @@ -#!/bin/sh -set -eu - -ARG0="$0" -BASE="/run/rootless-cni-infra" - -wait_unshare_net() { - pid="$1" - # NOTE: busybox shell doesn't support the `for ((i=0; i < $MAX; i++)); do foo; done` statement - i=0 - while :; do - if [ "$(readlink /proc/self/ns/net)" != "$(readlink /proc/${pid}/ns/net)" ]; then - break - fi - sleep 0.1 - if [ $i -ge 10 ]; then - echo >&2 "/proc/${pid}/ns/net cannot be unshared" - exit 1 - fi - i=$((i + 1)) - done -} - -# CLI subcommand: "alloc $CONTAINER_ID $NETWORK_NAME $POD_NAME $IP $MAC $CAP_ARGS" -cmd_entrypoint_alloc() { - if [ "$#" -ne 6 ]; then - echo >&2 "Usage: $ARG0 alloc CONTAINER_ID NETWORK_NAME POD_NAME IP MAC CAP_ARGS" - exit 1 - fi - - ID="$1" - NET="$2" - K8S_POD_NAME="$3" - IP="$4" - MAC="$5" - CAP_ARGS="$6" - - dir="${BASE}/${ID}" - mkdir -p "${dir}/attached" "${dir}/attached-args" - - pid="" - if [ -f "${dir}/pid" ]; then - pid=$(cat "${dir}/pid") - else - unshare -n sleep infinity & - pid="$!" 
- wait_unshare_net "${pid}" - echo "${pid}" >"${dir}/pid" - nsenter -t "${pid}" -n ip link set lo up - fi - CNI_ARGS="IgnoreUnknown=1;K8S_POD_NAME=${K8S_POD_NAME}" - if [ "$IP" ]; then - CNI_ARGS="$CNI_ARGS;IP=${IP}" - fi - if [ "$MAC" ]; then - CNI_ARGS="$CNI_ARGS;MAC=${MAC}" - fi - if [ "$CAP_ARGS" ]; then - CAP_ARGS="$CAP_ARGS" - fi - nwcount=$(find "${dir}/attached" -type f | wc -l) - CNI_IFNAME="eth${nwcount}" - export CNI_ARGS CNI_IFNAME CAP_ARGS - cnitool add "${NET}" "/proc/${pid}/ns/net" >"${dir}/attached/${NET}" - echo "${CNI_ARGS}" >"${dir}/attached-args/${NET}" - - # return the result - ns="/proc/${pid}/ns/net" - echo "{\"ns\":\"${ns}\"}" -} - -# CLI subcommand: "dealloc $CONTAINER_ID $NETWORK_NAME" -cmd_entrypoint_dealloc() { - if [ "$#" -ne 2 ]; then - echo >&2 "Usage: $ARG0 dealloc CONTAINER_ID NETWORK_NAME" - exit 1 - fi - - ID=$1 - NET=$2 - - dir="${BASE}/${ID}" - if [ ! -f "${dir}/pid" ]; then - exit 0 - fi - pid=$(cat "${dir}/pid") - if [ -f "${dir}/attached-args/${NET}" ]; then - CNI_ARGS=$(cat "${dir}/attached-args/${NET}") - export CNI_ARGS - fi - cnitool del "${NET}" "/proc/${pid}/ns/net" - rm -f "${dir}/attached/${NET}" "${dir}/attached-args/${NET}" - - nwcount=$(find "${dir}/attached" -type f | wc -l) - if [ "${nwcount}" = 0 ]; then - kill -9 "${pid}" - rm -rf "${dir}" - fi - - # return empty json - echo "{}" -} - -# CLI subcommand: "is-idle" -cmd_entrypoint_is_idle() { - if [ ! 
-d ${BASE} ]; then - echo '{"idle": true}' - elif [ -z "$(ls -1 ${BASE})" ]; then - echo '{"idle": true}' - else - echo '{"idle": false}' - fi -} - -# CLI subcommand: "print-cni-result $CONTAINER_ID $NETWORK_NAME" -cmd_entrypoint_print_cni_result() { - if [ "$#" -ne 2 ]; then - echo >&2 "Usage: $ARG0 print-cni-result CONTAINER_ID NETWORK_NAME" - exit 1 - fi - - ID=$1 - NET=$2 - - # the result shall be CNI JSON - cat "${BASE}/${ID}/attached/${NET}" -} - -# CLI subcommand: "print-netns-path $CONTAINER_ID" -cmd_entrypoint_print_netns_path() { - if [ "$#" -ne 1 ]; then - echo >&2 "Usage: $ARG0 print-netns-path CONTAINER_ID" - exit 1 - fi - - ID=$1 - - pid=$(cat "${BASE}/${ID}/pid") - path="/proc/${pid}/ns/net" - - # return the result - echo "{\"path\":\"${path}\"}" -} - -# CLI subcommand: "help" -cmd_entrypoint_help() { - echo "Usage: ${ARG0} COMMAND" - echo - echo "Rootless CNI Infra container" - echo - echo "Commands:" - echo " alloc Allocate a netns" - echo " dealloc Deallocate a netns" - echo " is-idle Print whether the infra container is idle" - echo " print-cni-result Print CNI result" - echo " print-netns-path Print netns path" - echo " help Print help" - echo " version Print version" -} - -# CLI subcommand: "version" -cmd_entrypoint_version() { - echo "{\"version\": \"${ROOTLESS_CNI_INFRA_VERSION}\"}" -} - -# parse args -command="${1:-}" -if [ -z "$command" ]; then - echo >&2 "No command was specified. Run \`${ARG0} help\` to see the usage." - exit 1 -fi - -command_func=$(echo "cmd_entrypoint_${command}" | sed -e "s/-/_/g") -if ! command -v "${command_func}" >/dev/null 2>&1; then - echo >&2 "Unknown command: ${command}. Run \`${ARG0} help\` to see the usage." 
- exit 1 -fi - -# start the command func -shift -"${command_func}" "$@" diff --git a/libpod/network/create.go b/libpod/network/create.go index c58d625756..210b1a44b8 100644 --- a/libpod/network/create.go +++ b/libpod/network/create.go @@ -11,7 +11,6 @@ import ( "github.com/containernetworking/cni/pkg/version" "github.com/containers/common/pkg/config" "github.com/containers/podman/v2/pkg/domain/entities" - "github.com/containers/podman/v2/pkg/rootless" "github.com/containers/podman/v2/pkg/util" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -223,9 +222,8 @@ func createBridge(name string, options entities.NetworkCreateOptions, runtimeCon plugins = append(plugins, NewPortMapPlugin()) plugins = append(plugins, NewFirewallPlugin()) plugins = append(plugins, NewTuningPlugin()) - // if we find the dnsname plugin or are rootless, we add configuration for it - // the rootless-cni-infra container has the dnsname plugin always installed - if (HasDNSNamePlugin(runtimeConfig.Network.CNIPluginDirs) || rootless.IsRootless()) && !options.DisableDNS { + // if we find the dnsname plugin installed include it in the config + if HasDNSNamePlugin(runtimeConfig.Network.CNIPluginDirs) && !options.DisableDNS { if options.Internal { logrus.Warnf("dnsname and --internal networks are incompatible. 
dnsname plugin not configured for network %s", name) } else { diff --git a/libpod/rootless_cni_linux.go b/libpod/rootless_cni_linux.go index 94ae062aa4..0d525989d2 100644 --- a/libpod/rootless_cni_linux.go +++ b/libpod/rootless_cni_linux.go @@ -3,19 +3,21 @@ package libpod import ( + "bufio" "bytes" "context" "io" + "os" "path/filepath" - "runtime" + "strconv" cnitypes "github.com/containernetworking/cni/pkg/types/current" "github.com/containernetworking/plugins/pkg/ns" "github.com/containers/podman/v2/libpod/define" - "github.com/containers/podman/v2/libpod/image" - "github.com/containers/podman/v2/pkg/env" - "github.com/containers/podman/v2/pkg/util" + "github.com/containers/podman/v2/libpod/network" + rootlesscni "github.com/containers/podman/v2/pkg/rootless/cni" "github.com/containers/storage/pkg/lockfile" + "github.com/containers/storage/pkg/mount" "github.com/hashicorp/go-multierror" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" @@ -23,16 +25,42 @@ import ( "github.com/sirupsen/logrus" ) -// Built from ../contrib/rootless-cni-infra. 
-var rootlessCNIInfraImage = map[string]string{ - "amd64": "quay.io/libpod/rootless-cni-infra@sha256:adf352454666f7ce9ca3e1098448b5ee18f89c4516471ec99447ec9ece917f36", // 5-amd64 -} - const ( - rootlessCNIInfraContainerNamespace = "podman-system" - rootlessCNIInfraContainerName = "rootless-cni-infra" + rootlessCNIInfraContainerNamespace = "podman-system" + rootlessCNIInfraContainerName = "rootless-cni-infra" + rootlessCNIInfraContainerVersionLabelName = "rootless-cni-infra-container-version" ) +func getRootlessCNIConfig(c *Container, network string) (*rootlesscni.Config, error) { + conf := rootlesscni.Config{ + ID: c.ID(), + Network: network, + CNIPodName: getCNIPodName(c), + PluginPaths: c.runtime.config.Network.CNIPluginDirs, + NetConfPath: c.runtime.config.Network.NetworkConfigDir, + } + // add static ip if given + if c.config.StaticIP != nil { + conf.IP = c.config.StaticIP.String() + } + // add static mac if given + if c.config.StaticMAC != nil { + conf.MAC = c.config.StaticMAC.String() + } + // add aliases as CapabilityArgs + aliases, err := c.runtime.state.GetAllNetworkAliases(c) + if err != nil { + return nil, err + } + conf.Aliases = aliases + if eth, exists := c.state.NetInterfaceDescriptions.getInterfaceByName(network); exists { + conf.InterfaceName = eth + } else { + return nil, errors.Errorf("no network interface name for %s", network) + } + return &conf, nil +} + // AllocRootlessCNI allocates a CNI netns inside the rootless CNI infra container. // Locks "rootless-cni-infra.lck". // @@ -47,6 +75,20 @@ func AllocRootlessCNI(ctx context.Context, c *Container) (ns.NetNS, []*cnitypes. 
if len(networks) == 0 { return nil, nil, errors.New("rootless CNI networking requires that the container has joined at least one CNI network") } + // check early that all given networks exists + for _, nw := range networks { + exists, err := network.Exists(c.runtime.config, nw) + if err != nil { + return nil, nil, err + } + if !exists { + return nil, nil, errors.Errorf("CNI network %q not found", nw) + } + } + // Update container map of interface descriptions + if err := c.setupNetworkDescriptions(networks); err != nil { + return nil, nil, err + } l, err := getRootlessCNIInfraLock(c.runtime) if err != nil { return nil, nil, err @@ -57,34 +99,14 @@ func AllocRootlessCNI(ctx context.Context, c *Container) (ns.NetNS, []*cnitypes. if err != nil { return nil, nil, err } - k8sPodName := getCNIPodName(c) // passed to CNI as K8S_POD_NAME - ip := "" - if c.config.StaticIP != nil { - ip = c.config.StaticIP.String() - } - mac := "" - if c.config.StaticMAC != nil { - mac = c.config.StaticMAC.String() - } - aliases, err := c.runtime.state.GetAllNetworkAliases(c) - if err != nil { - return nil, nil, err - } - capArgs := "" - // add network aliases json encoded as capabilityArgs for cni - if len(aliases) > 0 { - capabilityArgs := make(map[string]interface{}) - capabilityArgs["aliases"] = aliases - b, err := json.Marshal(capabilityArgs) + cniResults := make([]*cnitypes.Result, len(networks)) + for i, nw := range networks { + rootlessCNIConf, err := getRootlessCNIConfig(c, nw) if err != nil { return nil, nil, err } - capArgs = string(b) - } - cniResults := make([]*cnitypes.Result, len(networks)) - for i, nw := range networks { - cniRes, err := rootlessCNIInfraCallAlloc(infra, c.ID(), nw, k8sPodName, ip, mac, capArgs) + cniRes, err := rootlessCNIInfraCallAlloc(infra, rootlessCNIConf) if err != nil { return nil, nil, err } @@ -124,7 +146,11 @@ func DeallocRootlessCNI(ctx context.Context, c *Container) error { } var errs *multierror.Error for _, nw := range networks { - err := 
rootlessCNIInfraCallDealloc(infra, c.ID(), nw) + rootlessCNIConf, err := getRootlessCNIConfig(c, nw) + if err != nil { + return err + } + err = rootlessCNIInfraCallDealloc(infra, rootlessCNIConf) if err != nil { errs = multierror.Append(errs, err) } @@ -134,11 +160,21 @@ func DeallocRootlessCNI(ctx context.Context, c *Container) error { logrus.Warn(err) } logrus.Debugf("rootless CNI: removing infra container %q", infra.ID()) + // Kill the infra container. There is no need to cleanup files because + // are stored in tmpfs so we can just sigkill it. It is important to kill the + // container before we remove it otherwise we have locking issues. + if err := infra.Kill(9); err != nil { + logrus.Error(err) + } infra.lock.Lock() defer infra.lock.Unlock() if err := c.runtime.removeContainer(ctx, infra, true, false, true); err != nil { return err } + rootfs := filepath.Join(c.runtime.GetStore().RunRoot(), rootlessCNIInfraContainerName) + if err := mount.RecursiveUnmount(rootfs); err != nil { + return errors.Wrapf(err, "failed to unmount rootfs for %s", rootlessCNIInfraContainerName) + } logrus.Debugf("rootless CNI: removed infra container %q", infra.ID()) } return errs.ErrorOrNil() @@ -161,57 +197,94 @@ func getCNIPodName(c *Container) string { return c.Name() } -func rootlessCNIInfraCallAlloc(infra *Container, id, nw, k8sPodName, ip, mac, capArgs string) (*cnitypes.Result, error) { - logrus.Debugf("rootless CNI: alloc %q, %q, %q, %q, %q, %q", id, nw, k8sPodName, ip, mac, capArgs) +func rootlessCNIInfraCallAlloc(infra *Container, cfg *rootlesscni.Config) (*cnitypes.Result, error) { + logrus.Debugf("rootless CNI: alloc %v", cfg) var err error + var cniRes cnitypes.Result + var cniResBytes []byte + labels := infra.Labels() + // we might want to check for the version here but for now the existence of the label is fine + if _, ok := labels[rootlessCNIInfraContainerVersionLabelName]; ok { + bytes, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + cniResBytes, 
err = rootlessCNIInfraExec(infra, bytes, "alloc") + if err != nil { + return nil, err + } + } else { + // old rootless-cni-infra container api + // keep for backwarts compatibility with previous version to support live migration + // TODO: remove this in a future release maybe 4.0? - _, err = rootlessCNIInfraExec(infra, "alloc", id, nw, k8sPodName, ip, mac, capArgs) - if err != nil { - return nil, err - } - cniResStr, err := rootlessCNIInfraExec(infra, "print-cni-result", id, nw) - if err != nil { - return nil, err + // add network aliases json encoded as capabilityArgs for cni + capArgs := "" + if len(cfg.Aliases) > 0 { + capabilityArgs := make(map[string]interface{}) + capabilityArgs["aliases"] = cfg.Aliases + b, err := json.Marshal(capabilityArgs) + if err != nil { + return nil, err + } + capArgs = string(b) + } + _, err = rootlessCNIInfraExec(infra, nil, "alloc", cfg.ID, cfg.Network, cfg.CNIPodName, cfg.IP, cfg.MAC, capArgs) + if err != nil { + return nil, err + } + cniResBytes, err = rootlessCNIInfraExec(infra, nil, "print-cni-result", cfg.ID, cfg.Network) + if err != nil { + return nil, err + } } - var cniRes cnitypes.Result - if err := json.Unmarshal([]byte(cniResStr), &cniRes); err != nil { - return nil, errors.Wrapf(err, "unmarshaling as cnitypes.Result: %q", cniResStr) + if err := json.Unmarshal(cniResBytes, &cniRes); err != nil { + return nil, errors.Wrapf(err, "unmarshaling as cnitypes.Result: %q", string(cniResBytes)) } return &cniRes, nil } -func rootlessCNIInfraCallDealloc(infra *Container, id, nw string) error { - logrus.Debugf("rootless CNI: dealloc %q, %q", id, nw) - _, err := rootlessCNIInfraExec(infra, "dealloc", id, nw) +func rootlessCNIInfraCallDealloc(infra *Container, cfg *rootlesscni.Config) error { + logrus.Debugf("rootless CNI: dealloc %v", cfg) + var err error + labels := infra.Labels() + // we might want to check for the version here but for now the existence of the label is fine + if _, ok := 
labels[rootlessCNIInfraContainerVersionLabelName]; ok { + var bytes []byte + bytes, err = json.Marshal(cfg) + if err != nil { + return err + } + _, err = rootlessCNIInfraExec(infra, bytes, "dealloc") + } else { + // old rootless-cni-infra container api + // keep for backwarts compatibility with previous version to support live migration + // TODO: remove this in a future release maybe 4.0? + _, err = rootlessCNIInfraExec(infra, nil, "dealloc", cfg.ID, cfg.Network) + } return err } func rootlessCNIInfraIsIdle(infra *Container) (bool, error) { - type isIdle struct { - Idle bool `json:"idle"` - } - resStr, err := rootlessCNIInfraExec(infra, "is-idle") + resBytes, err := rootlessCNIInfraExec(infra, nil, "is-idle") if err != nil { return false, err } - var res isIdle - if err := json.Unmarshal([]byte(resStr), &res); err != nil { - return false, errors.Wrapf(err, "unmarshaling as isIdle: %q", resStr) + var res rootlesscni.IsIdle + if err := json.Unmarshal(resBytes, &res); err != nil { + return false, errors.Wrapf(err, "unmarshaling as IsIdle: %q", string(resBytes)) } return res.Idle, nil } -func rootlessCNIInfraGetNS(infra *Container, id string) (ns.NetNS, error) { - type printNetnsPath struct { - Path string `json:"path"` - } - resStr, err := rootlessCNIInfraExec(infra, "print-netns-path", id) +func rootlessCNIInfraGetNS(infra *Container, cid string) (ns.NetNS, error) { + resBytes, err := rootlessCNIInfraExec(infra, nil, "print-netns-path", cid) if err != nil { return nil, err } - var res printNetnsPath - if err := json.Unmarshal([]byte(resStr), &res); err != nil { - return nil, errors.Wrapf(err, "unmarshaling as printNetnsPath: %q", resStr) + var res rootlesscni.PrintNetnsPath + if err := json.Unmarshal(resBytes, &res); err != nil { + return nil, errors.Wrapf(err, "unmarshaling as PrintNetnsPath: %q", string(resBytes)) } nsObj, err := ns.GetNS(res.Path) if err != nil { @@ -250,6 +323,12 @@ func ensureRootlessCNIInfraContainerRunning(ctx context.Context, r *Runtime) (*C 
logrus.Debugf("rootless CNI: infra container %q is already running", c.ID()) return c, nil } + // we have to mount the rootfs before we start it + rootfs := filepath.Join(r.GetStore().RunRoot(), rootlessCNIInfraContainerName) + err = mountRootlessCNIINfraRootfs(rootfs) + if err != nil { + return nil, err + } logrus.Debugf("rootless CNI: infra container %q is %q, being started", c.ID(), st.State) if err := c.initAndStart(ctx); err != nil { return nil, err @@ -259,18 +338,6 @@ func ensureRootlessCNIInfraContainerRunning(ctx context.Context, r *Runtime) (*C } func startRootlessCNIInfraContainer(ctx context.Context, r *Runtime) (*Container, error) { - imageName, ok := rootlessCNIInfraImage[runtime.GOARCH] - if !ok { - return nil, errors.Errorf("cannot find rootless-podman-network-sandbox image for %s", runtime.GOARCH) - } - logrus.Debugf("rootless CNI: ensuring image %q to exist", imageName) - newImage, err := r.ImageRuntime().New(ctx, imageName, "", "", nil, nil, - image.SigningOptions{}, nil, util.PullImageMissing) - if err != nil { - return nil, err - } - logrus.Debugf("rootless CNI: image %q is ready", imageName) - g, err := generate.New("linux") if err != nil { return nil, err @@ -281,46 +348,72 @@ func startRootlessCNIInfraContainer(ctx context.Context, r *Runtime) (*Container return nil, err } g.RemoveMount("/proc") - procMount := spec.Mount{ - Destination: "/proc", - Type: "bind", - Source: "/proc", - Options: []string{"rbind", "nosuid", "noexec", "nodev"}, + + // need writable /run + run := spec.Mount{ + Destination: "/run", + Type: "tmpfs", + Source: "none", + Options: []string{"rw", "nosuid", "nodev"}, + } + g.AddMount(run) + + // mount /var as tmpfs + // On ungraceful shutdown cni leaves the ip allocation files in place. + // This causes issues when we try to use containers with the same ip again. + // The best way to clean them up is using a tmpfs mount. 
These files do not have to + // be persistent since the network namespace is destroyed anyway if the container exits. + // CNI tries to write to /var/lib/cni however we cannot mount there because + // it might not exists and we have no permission to create this directory. + cni := spec.Mount{ + Destination: "/var", + Type: "tmpfs", + Source: "none", + Options: []string{"rw", "nosuid", "nodev"}, + } + g.AddMount(cni) + + g.SetProcessArgs([]string{rootlesscni.InfraCmd, "sleep"}) + + // get the current path this executable so we can mount it + podmanexe, err := os.Executable() + if err != nil { + return nil, err } - g.AddMount(procMount) - // Mount CNI networks - etcCNINetD := spec.Mount{ - Destination: "/etc/cni/net.d", + podman := spec.Mount{ + // mount with different name to trigger the reexec for rooless-cni-infra + Destination: rootlesscni.InfraCmd, Type: "bind", - Source: r.config.Network.NetworkConfigDir, + Source: podmanexe, Options: []string{"ro", "bind"}, } - g.AddMount(etcCNINetD) + g.AddMount(podman) - inspectData, err := newImage.Inspect(ctx) + rootfs := filepath.Join(r.GetStore().RunRoot(), rootlessCNIInfraContainerName) + err = mountRootlessCNIINfraRootfs(rootfs) if err != nil { return nil, err } - imageEnv, err := env.ParseSlice(inspectData.Config.Env) - if err != nil { - return nil, err - } - for k, v := range imageEnv { - g.AddProcessEnv(k, v) - } - if len(inspectData.Config.Cmd) == 0 { - return nil, errors.Errorf("rootless CNI infra image %q has no command specified", imageName) + + g.SetRootReadonly(true) + g.SetHostname(rootlessCNIInfraContainerName) + + infraLabels := map[string]string{ + rootlessCNIInfraContainerVersionLabelName: strconv.Itoa(rootlesscni.Version), } - g.SetProcessArgs(inspectData.Config.Cmd) - var options []CtrCreateOption - options = append(options, WithRootFSFromImage(newImage.ID(), imageName, imageName)) - options = append(options, WithCtrNamespace(rootlessCNIInfraContainerNamespace)) - options = append(options, 
WithName(rootlessCNIInfraContainerName)) - options = append(options, WithPrivileged(true)) - options = append(options, WithSecLabels([]string{"disable"})) - options = append(options, WithRestartPolicy("always")) - options = append(options, WithNetNS(nil, false, "slirp4netns", nil)) + options := []CtrCreateOption{ + WithRootFS(rootfs), + WithCtrNamespace(rootlessCNIInfraContainerNamespace), + WithName(rootlessCNIInfraContainerName), + WithPrivileged(true), + // label=disable doesn't work correct for a rootfs mount + // set labels manually to unconfined + WithSecLabels([]string{"user:unconfined_u", "role:system_r", "type:unconfined_t"}), + WithRestartPolicy("always"), + WithNetNS(nil, false, "slirp4netns", nil), + WithLabels(infraLabels), + } c, err := r.NewContainer(ctx, g.Config, options...) if err != nil { return nil, err @@ -334,14 +427,39 @@ func startRootlessCNIInfraContainer(ctx context.Context, r *Runtime) (*Container return c, nil } -func rootlessCNIInfraExec(c *Container, args ...string) (string, error) { - cmd := "rootless-cni-infra" +func mountRootlessCNIINfraRootfs(rootfs string) error { + if err := os.MkdirAll(rootfs, 0700); err != nil { + return err + } + // bind mount the rootfs recursive in the userns + // only the root will be read-only + if err := mount.Mount("/", rootfs, "bind", "rbind,rprivate,ro"); err != nil { + return errors.Wrapf(err, "failed to mount rootfs for %s", rootlessCNIInfraContainerName) + } + return nil +} + +func rootlessCNIInfraExec(c *Container, stdin []byte, args ...string) ([]byte, error) { + cmd := rootlesscni.InfraCmd + labels := c.Labels() + if _, ok := labels[rootlessCNIInfraContainerVersionLabelName]; !ok { + // the old infra container had a different exec cmd + // change it for backwarts compatibility + cmd = rootlessCNIInfraContainerName + } var ( outB bytes.Buffer errB bytes.Buffer streams define.AttachStreams config ExecConfig ) + + if len(stdin) > 0 { + logrus.Debugf("rootlessCNIInfraExec: stdin=%s", string(stdin)) + 
r := bufio.NewReader(bytes.NewReader(stdin))
+		streams.InputStream = r
+		streams.AttachInput = true
+	}
 	streams.OutputStream = &nopWriteCloser{Writer: &outB}
 	streams.ErrorStream = &nopWriteCloser{Writer: &errB}
 	streams.AttachOutput = true
@@ -354,13 +472,13 @@ func rootlessCNIInfraExec(c *Container, args ...string) (string, error) {
 	logrus.Debugf("rootlessCNIInfraExec: c.ID()=%s, config=%+v, streams=%v, end (code=%d, err=%v)",
 		c.ID(), config, streams, code, err)
 	if err != nil {
-		return "", err
+		return nil, err
 	}
 	if code != 0 {
-		return "", errors.Errorf("command %s %v in container %s failed with status %d, stdout=%q, stderr=%q",
+		return nil, errors.Errorf("command %s %v in container %s failed with status %d, stdout=%q, stderr=%q",
 			cmd, args, c.ID(), code, outB.String(), errB.String())
 	}
-	return outB.String(), nil
+	return outB.Bytes(), nil
 }
 
 type nopWriteCloser struct {
diff --git a/pkg/rootless/cni/rootless_cni.go b/pkg/rootless/cni/rootless_cni.go
new file mode 100644
index 0000000000..9d1e3b23a1
--- /dev/null
+++ b/pkg/rootless/cni/rootless_cni.go
@@ -0,0 +1,417 @@
+package cni
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path"
+	"strconv"
+	"syscall"
+	"time"
+
+	"github.com/containernetworking/cni/libcni"
+	"github.com/containers/storage/pkg/reexec"
+	"github.com/pkg/errors"
+	"github.com/vishvananda/netlink"
+)
+
+const (
+	// InfraCmd - always use an absolute path we should not rely on $PATH
+	// this also has to be a user writable location and /run is writable for the user
+	InfraCmd            = "/run/rootless-cni-infra-exe"
+	infraCreateNetNSCmd = "rootless-cni-infra-create-netns"
+	basePath            = "/run/rootless-cni-infra"
+	// Version - you should bump the Version if you do breaking changes to this script
+	Version = 6
+)
+
+// Config passed via stdin as json
+type Config struct {
+	// ID - container ID
+	ID string
+	// Network - network name
+	Network string
+	// CNIPodName - name used for the dns entry by the dnsname plugin
+	CNIPodName string
+	// IP - static IP address
+	IP string
+	// MAC - static mac address
+	MAC string
+	// Aliases - network aliases, further dns entries for the dnsname plugin
+	Aliases map[string][]string
+	// InterfaceName - network interface name in the container for this network (e.g eth0)
+	InterfaceName string
+	// PluginPaths - search paths for the cni plugins
+	PluginPaths []string
+	// NetConfPath - path where the cni config files are located
+	NetConfPath string
+}
+
+// PrintNetnsPath is returned by print-netns-path as json
+type PrintNetnsPath struct {
+	Path string `json:"path"`
+}
+
+// IsIdle is returned by is-idle as json
+type IsIdle struct {
+	Idle bool `json:"idle"`
+}
+
+// printErrorf writes the formatted message to stderr, prefixed with "Error: "
+// and followed by a newline
+func printErrorf(format string, a ...interface{}) {
+	fmt.Fprintf(os.Stderr, "Error: "+format+"\n", a...)
+}
+
+// printJSONResult marshals v to json and prints it to stdout
+// a marshal error is reported on stderr and an empty line is printed instead
+func printJSONResult(v interface{}) {
+	b, err := json.Marshal(v)
+	if err != nil {
+		printErrorf("%s", err)
+	}
+	fmt.Println(string(b))
+}
+
+// init registers the reexec entry points: InfraCmd dispatches on the
+// subcommand in os.Args[1] and infraCreateNetNSCmd creates a new net namespace
+func init() {
+	reexec.Register(InfraCmd, func() {
+		if len(os.Args) < 2 {
+			exit(errors.Errorf("%s requires at least one arg", InfraCmd))
+		}
+
+		switch os.Args[1] {
+		case "alloc":
+			alloc()
+
+		case "dealloc":
+			dealloc()
+
+		case "is-idle":
+			idle := IsIdle{
+				Idle: false,
+			}
+			empty, err := dirIsEmpty(basePath)
+			if os.IsNotExist(err) || empty {
+				idle.Idle = true
+			} else if err != nil {
+				printErrorf("%s", err)
+			}
+			printJSONResult(idle)
+
+		case "print-netns-path":
+			if len(os.Args) != 3 {
+				exit(errors.Errorf("%s print-netns-path requires one arg", InfraCmd))
+			}
+			pidfile := path.Join(basePath, os.Args[2], "pid")
+			nsPath, err := getNetNamespacePath(pidfile)
+			if err != nil {
+				exit(err)
+			}
+			var netns PrintNetnsPath
+			netns.Path = nsPath
+			printJSONResult(netns)
+
+		case "sleep":
+			// sleep subcommand used to keep the namespace alive
+			// sleep max duration
+			time.Sleep(time.Duration(1<<63 - 1))
+
+		default:
+			exit(errors.Errorf("Unknown command: %s %s", InfraCmd, os.Args[1]))
+		}
+	})
+
+	reexec.Register(infraCreateNetNSCmd, func() {
+		if len(os.Args) != 2 {
+			exit(errors.Errorf("%s requires one arg", infraCreateNetNSCmd))
+		}
+		pidfile := os.Args[1]
+		if err := os.MkdirAll(path.Dir(pidfile), 0700); err != nil {
+			exit(err)
+		}
+		// create new net namespace
+		if err := syscall.Unshare(syscall.CLONE_NEWNET); err != nil {
+			exit(err)
+		}
+
+		// background process to keep the net namespace alive
+		sleep := reexec.Command(InfraCmd, "sleep")
+		if err := sleep.Start(); err != nil {
+			exit(err)
+		}
+		pid := sleep.Process.Pid
+		stringPid := strconv.Itoa(pid)
+
+		if err := ioutil.WriteFile(pidfile, []byte(stringPid), 0700); err != nil {
+			exit(errors.Wrap(err, "failed to write pid file"))
+		}
+
+		// set the loopback adapter up
+		lo, err := netlink.LinkByName("lo")
+		if err != nil {
+			exit(errors.Wrap(err, "failed to get the loopback adapter"))
+		}
+		if err = netlink.LinkSetUp(lo); err != nil {
+			exit(errors.Wrap(err, "failed to set the loopback adapter up"))
+		}
+	})
+}
+
+// exit with ec 0 if error is nil otherwise exit with ec 1 and log the error to stderr
+func exit(err error) {
+	if err != nil {
+		printErrorf("%s", err)
+		os.Exit(1)
+	}
+	os.Exit(0)
+}
+
+// dirIsEmpty reports whether the directory name contains no entries
+// an error from opening or reading the directory is returned as is
+func dirIsEmpty(name string) (bool, error) {
+	f, err := os.Open(name)
+	if err != nil {
+		return false, err
+	}
+	defer f.Close()
+
+	names, err := f.Readdirnames(1)
+	// Readdirnames returns EOF error if it is empty
+	if len(names) == 0 && err == io.EOF {
+		return true, nil
+	}
+	return false, err
+}
+
+// readConfigFromStdin reads the config from stdin
+func readConfigFromStdin() (*Config, error) {
+	var config Config
+	stat, err := os.Stdin.Stat()
+	if err != nil {
+		return nil, errors.Wrapf(err, "unable to read from stdin")
+	}
+	if stat.Mode()&os.ModeNamedPipe == 0 {
+		return nil, errors.New("nothing to read from stdin")
+	}
+	b, err := ioutil.ReadAll(os.Stdin)
+	if err != nil {
+		return nil, err
+	}
+	err = json.Unmarshal(b, &config)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to read RootlessCNIConfig json")
+	}
+	return &config, nil
+}
+
+// getNetNamespacePath reads the pid from pidfile and returns /proc/<pid>/ns/net
+// the read error is wrapped, use errors.Cause to check for os.IsNotExist
+func getNetNamespacePath(pidfile string) (string, error) {
+	b, err := ioutil.ReadFile(pidfile)
+	if err != nil {
+		return "", errors.Wrap(err, "failed to read pid file")
+	}
+	pid := string(b)
+	return path.Join("/proc", pid, "ns", "net"), nil
+}
+
+// createNetNamespace runs infraCreateNetNSCmd to create a new net namespace
+// and returns the namespace path read back from pidfile
+func createNetNamespace(pidfile string) (string, error) {
+	rcmd := reexec.Command(infraCreateNetNSCmd, pidfile)
+	rcmd.Stderr = os.Stderr
+	rcmd.Stdout = os.Stdout
+	if err := rcmd.Run(); err != nil {
+		return "", errors.Wrap(err, "failed to create network namespace")
+	}
+	return getNetNamespacePath(pidfile)
+}
+
+// createCNIconfigs builds the cni config, network config list and runtime config from cfg
+// if the network config cannot be loaded it cleans up the container files and exits
+func createCNIconfigs(cfg *Config) (*libcni.CNIConfig, *libcni.NetworkConfigList, *libcni.RuntimeConf) {
+	args := [][2]string{
+		{"IgnoreUnknown", "1"},
+		{"K8S_POD_NAME", cfg.CNIPodName},
+	}
+	// add static ip if given
+	if cfg.IP != "" {
+		args = append(args, [2]string{"IP", cfg.IP})
+	}
+	// add static mac if given
+	if cfg.MAC != "" {
+		args = append(args, [2]string{"MAC", cfg.MAC})
+	}
+
+	// add aliases
+	capabilityArgs := make(map[string]interface{})
+	if len(cfg.Aliases) > 0 {
+		capabilityArgs["aliases"] = cfg.Aliases
+	}
+
+	rt := &libcni.RuntimeConf{
+		ContainerID:    cfg.ID,
+		IfName:         cfg.InterfaceName,
+		Args:           args,
+		CapabilityArgs: capabilityArgs,
+	}
+
+	netconf, err := libcni.LoadConfList(cfg.NetConfPath, cfg.Network)
+	if err != nil {
+		// only report a cleanup failure, do not print a nil error
+		if cleanupErr := cleanupFiles(getPaths(cfg.ID, cfg.Network)); cleanupErr != nil {
+			printErrorf("%v", cleanupErr)
+		}
+		exit(err)
+	}
+
+	cninet := libcni.NewCNIConfig(cfg.PluginPaths, nil)
+
+	return cninet, netconf, rt
+}
+
+// getPaths returns the pid file and the network file path for the given container ID and network
+func getPaths(cid, net string) (string, string) {
+	base := path.Join(basePath, cid)
+	pidfile := path.Join(base, "pid")
+	netfile := path.Join(base, "networks", net)
+	return pidfile, netfile
+}
+
+// alloc attaches the container from the stdin config to the cni network
+// the net namespace is created first if the pid file does not exist yet
+func alloc() {
+	conf, err := readConfigFromStdin()
+	if err != nil {
+		exit(err)
+	}
+	pidfile, netfile := getPaths(conf.ID, conf.Network)
+	ns, err := getNetNamespacePath(pidfile)
+	if err != nil && !os.IsNotExist(errors.Cause(err)) {
+		exit(err)
+	}
+	// if namespace path does not exist create new namespace
+	if os.IsNotExist(errors.Cause(err)) {
+		ns, err = createNetNamespace(pidfile)
+		if err != nil {
+			exit(err)
+		}
+	}
+
+	if err := os.MkdirAll(path.Dir(netfile), 0700); err != nil {
+		exit(err)
+	}
+	// create a file to keep track of the attached networks
+	f, err := os.Create(netfile)
+	if err != nil {
+		exit(err)
+	}
+	// the file is only a marker, close it to not leak the descriptor
+	f.Close()
+
+	// prepare the cni configs
+	cninet, netconf, rt := createCNIconfigs(conf)
+	rt.NetNS = ns
+
+	// call cni to add the network
+	res, err := cninet.AddNetworkList(context.TODO(), netconf, rt)
+	if err != nil {
+		// cleanup to make sure we don't have dangling files
+		// this is important to detect is-idle correctly
+		cleanupErr := cleanupFiles(pidfile, netfile)
+		if cleanupErr != nil {
+			printErrorf("%v", cleanupErr)
+		}
+		exit(errors.Wrapf(err, "failed to attach to cni network %s", conf.Network))
+	}
+	// print res to stdout
+	res.Print()
+}
+
+// dealloc detaches the container from the stdin config from the cni network
+// and cleans up its files, it exits without error if the pid file is already gone
+func dealloc() {
+	conf, err := readConfigFromStdin()
+	if err != nil {
+		exit(err)
+	}
+	pidfile, netfile := getPaths(conf.ID, conf.Network)
+	ns, err := getNetNamespacePath(pidfile)
+	// getNetNamespacePath wraps the error, unwrap it before the not exist check
+	if err != nil && !os.IsNotExist(errors.Cause(err)) {
+		exit(err)
+	}
+	if os.IsNotExist(errors.Cause(err)) {
+		// if the file does not exist the namespace is probably already deleted
+		// exit without error
+		exit(nil)
+	}
+
+	// prepare the cni configs
+	cninet, netconf, rt := createCNIconfigs(conf)
+	rt.NetNS = ns
+
+	// call cni to remove the network
+	err = cninet.DelNetworkList(context.TODO(), netconf, rt)
+	if err != nil {
+		exit(errors.Wrapf(err, "failed to detach cni network %s", conf.Network))
+	}
+
+	err = cleanupFiles(pidfile, netfile)
+	if err != nil {
+		exit(err)
+	}
+
+	// print empty json result
+	// we have no information to return
+	fmt.Println("{}")
+}
+
+// cleanupFiles removes netfile and, if no other network file is left for the
+// container, kills the pause process and removes the container directory
+func cleanupFiles(pidfile, netfile string) error {
+	// remove the config file
+	err := os.Remove(netfile)
+	if err != nil && !os.IsNotExist(err) {
+		return err
+	}
+
+	// check if the config directory is empty
+	empty, err := dirIsEmpty(path.Dir(netfile))
+	if err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	if empty {
+		// if it is empty no more networks are attached to this container
+		// therefore kill the net namespace
+		var piderr error
+		b, err := ioutil.ReadFile(pidfile)
+		if err == nil {
+			pid, err := strconv.Atoi(string(b))
+			if err == nil {
+				// kill the pause process which keeps the net ns alive
+				err = syscall.Kill(pid, syscall.SIGKILL)
+				if err != nil {
+					piderr = errors.Wrap(err, "failed to kill the pause process")
+				}
+			} else {
+				piderr = errors.Wrap(err, "failed to parse the pid")
+			}
+		} else {
+			piderr = errors.Wrap(err, "failed to read the pid file")
+		}
+		// remove all remaining configuration files for this container
+		// always remove even if the pidfile parsing failed to ensure we do not have dangling files
+		err = os.RemoveAll(path.Dir(pidfile))
+		if err != nil {
+			return err
+		}
+		return piderr
+	}
+	return nil
+}
diff --git a/test/system/500-networking.bats b/test/system/500-networking.bats
index 0d976a6af5..121149f835 100644
--- a/test/system/500-networking.bats
+++ b/test/system/500-networking.bats
@@ -144,13 +144,6 @@ load helpers
 
     run_podman network rm $mynetname
     run_podman 1 network rm $mynetname
-
-    # rootless CNI leaves behind an image pulled by SHA, hence with no tag.
-    # Remove it if present; we can only remove it by ID.
-    run_podman images --format '{{.Id}}' rootless-cni-infra
-    if [ -n "$output" ]; then
-        run_podman rmi $output
-    fi
 }
 
 @test "podman network reload" {