From e5254329261331647300a9f2e09f9bf584911c13 Mon Sep 17 00:00:00 2001 From: luoyunhe Date: Wed, 2 Mar 2022 14:48:27 +0800 Subject: [PATCH 1/3] support dpdk hybrid node --- .../workflows/build-kube-ovn-base-dpdk.yaml | 56 ++++ Makefile | 9 + cmd/cni/cni.go | 42 +-- dist/images/Dockerfile.base-dpdk | 93 +++++++ dist/images/Dockerfile.dpdk | 21 ++ dist/images/cleanup.sh | 1 + dist/images/install.sh | 186 ++++++++++++- dist/images/ovs-dpdk-config | 2 + dist/images/start-ovs-dpdk-v2.sh | 87 ++++++ docs/dpdk-hybrid.md | 249 ++++++++++++++++++ pkg/controller/vpc_nat_gateway.go | 1 - pkg/daemon/config.go | 7 +- pkg/daemon/handler.go | 89 +++++++ pkg/daemon/ovs.go | 50 ++++ pkg/ovs/ovs-vsctl.go | 20 +- pkg/request/cniserver.go | 4 + pkg/util/const.go | 5 + 17 files changed, 895 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/build-kube-ovn-base-dpdk.yaml create mode 100644 dist/images/Dockerfile.base-dpdk create mode 100644 dist/images/Dockerfile.dpdk create mode 100644 dist/images/ovs-dpdk-config create mode 100755 dist/images/start-ovs-dpdk-v2.sh create mode 100644 docs/dpdk-hybrid.md diff --git a/.github/workflows/build-kube-ovn-base-dpdk.yaml b/.github/workflows/build-kube-ovn-base-dpdk.yaml new file mode 100644 index 00000000000..98a4c3d5f55 --- /dev/null +++ b/.github/workflows/build-kube-ovn-base-dpdk.yaml @@ -0,0 +1,56 @@ +name: Build Base DPDK +on: workflow_dispatch + +jobs: + build-amd64: + name: Build AMD64 + runs-on: ubuntu-20.04 + steps: + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Build + run: | + make base-amd64-dpdk + make base-tar-amd64-dpdk + + - name: Upload image to artifact + uses: actions/upload-artifact@v2 + with: + name: image-amd64-dpdk + path: image-amd64-dpdk.tar + + + push: + needs: + - build-amd64 + name: push + runs-on: ubuntu-20.04 + steps: + - name: Check out code into the Go module directory + uses: 
actions/checkout@v2 + + - name: Download image + uses: actions/download-artifact@v2 + with: + name: image-amd64-dpdk + + - name: Load Image + run: | + docker load --input image-amd64-dpdk.tar + + - name: Push + env: + DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + COMMIT: ${{ github.sha }} + run: | + cat VERSION + TAG=$(cat VERSION) + echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin + docker images + docker tag kubeovn/kube-ovn-base:$TAG-amd64-dpdk kubeovn/kube-ovn-base:$TAG-dpdk + docker push kubeovn/kube-ovn-base:$TAG-dpdk \ No newline at end of file diff --git a/Makefile b/Makefile index 3531a694b11..836a1ed0191 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,10 @@ build-dpdk: base-amd64: docker buildx build --platform linux/amd64 --build-arg ARCH=amd64 -t $(REGISTRY)/kube-ovn-base:$(RELEASE_TAG)-amd64 -o type=docker -f dist/images/Dockerfile.base dist/images/ +.PHONY: base-amd64-dpdk +base-amd64-dpdk: + docker buildx build --platform linux/amd64 --build-arg ARCH=amd64 -t $(REGISTRY)/kube-ovn-base:$(RELEASE_TAG)-amd64-dpdk -o type=docker -f dist/images/Dockerfile.base-dpdk dist/images/ + .PHONY: base-arm64 base-arm64: docker buildx build --platform linux/arm64 --build-arg ARCH=arm64 -t $(REGISTRY)/kube-ovn-base:$(RELEASE_TAG)-arm64 -o type=docker -f dist/images/Dockerfile.base dist/images/ @@ -51,6 +55,7 @@ base-arm64: .PHONY: release release: lint build-go docker buildx build --platform linux/amd64 --build-arg ARCH=amd64 -t $(REGISTRY)/kube-ovn:$(RELEASE_TAG) -o type=docker -f dist/images/Dockerfile dist/images/ + docker buildx build --platform linux/amd64 --build-arg ARCH=amd64 -t $(REGISTRY)/kube-ovn:$(RELEASE_TAG)-dpdk -o type=docker -f dist/images/Dockerfile.dpdk dist/images/ docker buildx build --platform linux/amd64 --build-arg ARCH=amd64 -t $(REGISTRY)/vpc-nat-gateway:$(RELEASE_TAG) -o type=docker -f dist/images/vpcnatgateway/Dockerfile dist/images/vpcnatgateway docker 
buildx build --platform linux/amd64 --build-arg ARCH=amd64 -t $(REGISTRY)/centos7-compile:$(RELEASE_TAG) -o type=docker -f dist/images/compile/centos7/Dockerfile fastpath/ # docker buildx build --platform linux/amd64 --build-arg ARCH=amd64 -t $(REGISTRY)/centos8-compile:$(RELEASE_TAG) -o type=docker -f dist/images/compile/centos8/Dockerfile fastpath/ @@ -79,6 +84,10 @@ tar: base-tar-amd64: docker save $(REGISTRY)/kube-ovn-base:$(RELEASE_TAG)-amd64 -o image-amd64.tar +.PHONY: base-tar-amd64-dpdk +base-tar-amd64-dpdk: + docker save $(REGISTRY)/kube-ovn-base:$(RELEASE_TAG)-amd64-dpdk -o image-amd64-dpdk.tar + .PHONY: base-tar-arm64 base-tar-arm64: docker save $(REGISTRY)/kube-ovn-base:$(RELEASE_TAG)-arm64 -o image-arm64.tar diff --git a/cmd/cni/cni.go b/cmd/cni/cni.go index 7bdddc4e5b9..9df904d6a92 100644 --- a/cmd/cni/cni.go +++ b/cmd/cni/cni.go @@ -48,16 +48,18 @@ func cmdAdd(args *skel.CmdArgs) error { client := request.NewCniServerClient(netConf.ServerSocket) response, err := client.Add(request.CniRequest{ - CniType: netConf.Type, - PodName: podName, - PodNamespace: podNamespace, - ContainerID: args.ContainerID, - NetNs: args.Netns, - IfName: args.IfName, - Provider: netConf.Provider, - Routes: netConf.Routes, - DeviceID: netConf.DeviceID, - VfDriver: netConf.VfDriver, + CniType: netConf.Type, + PodName: podName, + PodNamespace: podNamespace, + ContainerID: args.ContainerID, + NetNs: args.Netns, + IfName: args.IfName, + Provider: netConf.Provider, + Routes: netConf.Routes, + DeviceID: netConf.DeviceID, + VfDriver: netConf.VfDriver, + VhostUserSocketVolumeName: netConf.VhostUserSocketVolumeName, + VhostUserSocketName: netConf.VhostUserSocketName, }) if err != nil { return err @@ -131,14 +133,15 @@ func cmdDel(args *skel.CmdArgs) error { } return client.Del(request.CniRequest{ - CniType: netConf.Type, - PodName: podName, - PodNamespace: podNamespace, - ContainerID: args.ContainerID, - NetNs: args.Netns, - IfName: args.IfName, - Provider: netConf.Provider, - 
DeviceID: netConf.DeviceID, + CniType: netConf.Type, + PodName: podName, + PodNamespace: podNamespace, + ContainerID: args.ContainerID, + NetNs: args.Netns, + IfName: args.IfName, + Provider: netConf.Provider, + DeviceID: netConf.DeviceID, + VhostUserSocketVolumeName: netConf.VhostUserSocketVolumeName, }) } @@ -156,6 +159,9 @@ type netConf struct { // PciAddrs in case of using sriov DeviceID string `json:"deviceID"` VfDriver string `json:"vf_driver"` + // for dpdk + VhostUserSocketVolumeName string `json:"vhost_user_socket_volume_name"` + VhostUserSocketName string `json:"vhost_user_socket_name"` } func loadNetConf(bytes []byte) (*netConf, string, error) { diff --git a/dist/images/Dockerfile.base-dpdk b/dist/images/Dockerfile.base-dpdk new file mode 100644 index 00000000000..3024ea19c5f --- /dev/null +++ b/dist/images/Dockerfile.base-dpdk @@ -0,0 +1,93 @@ +# syntax = docker/dockerfile:experimental +FROM ubuntu:22.04 as ovs-builder + +ARG ARCH +ARG DEBIAN_FRONTEND=noninteractive +ENV SRC_DIR='/usr/src' + +RUN apt update && apt install build-essential git libnuma-dev autoconf curl \ + python3 libmnl-dev libpcap-dev libtool libcap-ng-dev libssl-dev pkg-config \ + python3-six libunbound-dev libunwind-dev dh-make fakeroot debhelper dh-python \ + flake8 python3-sphinx graphviz groff wget libjemalloc-dev python3-pip -y + +RUN pip3 install meson ninja + +RUN cd /usr/src/ && \ + wget https://fast.dpdk.org/rel/dpdk-20.11.1.tar.xz && \ + tar xf dpdk-20.11.1.tar.xz && \ + export DPDK_DIR=/usr/src/dpdk-stable-20.11.1 && \ + export DPDK_BUILD=$DPDK_DIR/build && \ + cd $DPDK_DIR && \ + meson build && \ + ninja -C build && \ + ninja -C build install && \ + ldconfig + + +RUN cd /usr/src/ && \ + git clone -b branch-2.16 --depth=1 https://github.com/openvswitch/ovs.git && \ + cd ovs && \ + curl -s https://github.com/kubeovn/ovs/commit/22ea22c40b46ee5adeae977ff6cfca81b3ff25d7.patch | git apply && \ + ./boot.sh && \ + rm -rf .git && \ + export 
DPDK_DIR=/usr/src/dpdk-stable-20.11.1 && \ + CONFIGURE_OPTS='LIBS=-ljemalloc' && \ + if [ "$ARCH" = "amd64" ]; then CONFIGURE_OPTS='LIBS=-ljemalloc CFLAGS="-O2 -g -msse4.2 -mpopcnt"'; fi && \ + DATAPATH_CONFIGURE_OPTS='--prefix=/usr --with-dpdk=static' EXTRA_CONFIGURE_OPTS=$CONFIGURE_OPTS DEB_BUILD_OPTIONS='parallel=8 nocheck' fakeroot debian/rules binary + +RUN dpkg -i /usr/src/python3-openvswitch*.deb /usr/src/libopenvswitch*.deb + +RUN cd /usr/src/ && git clone -b branch-21.06 --depth=1 https://github.com/ovn-org/ovn.git && \ + cd ovn && \ + curl -s https://github.com/kubeovn/ovn/commit/e24734913d25c0bffdf1cfd79e14ef43d01e1019.patch | git apply && \ + curl -s https://github.com/kubeovn/ovn/commit/8f4e4868377afb5e980856755b9f6394f8b649e2.patch | git apply && \ + curl -s https://github.com/kubeovn/ovn/commit/23a87cabb76fbdce5092a6b3d3b56f3fa8dd61f5.patch | git apply && \ + curl -s https://github.com/kubeovn/ovn/commit/89ca60989df4af9a96cc6024e04f99b9b77bad22.patch | git apply && \ + curl -s https://github.com/kubeovn/ovn/commit/aeafa43fc51be8ea1c7abfbe779c69205c1c5aa4.patch | git apply && \ + curl -s https://github.com/kubeovn/ovn/commit/71f831b9cc5a6dc923af4ca90286857e2cf8b1d3.patch | git apply && \ + sed -i 's/OVN/ovn/g' debian/changelog && \ + rm -rf .git && \ + ./boot.sh && \ + CONFIGURE_OPTS='LIBS=-ljemalloc' && \ + if [ "$ARCH" = "amd64" ]; then CONFIGURE_OPTS='LIBS=-ljemalloc CFLAGS="-O2 -g -msse4.2 -mpopcnt"'; fi && \ + OVSDIR=/usr/src/ovs EXTRA_CONFIGURE_OPTS=$CONFIGURE_OPTS DEB_BUILD_OPTIONS='parallel=8 nocheck' fakeroot debian/rules binary + +RUN mkdir /packages/ && \ + cp /usr/src/libopenvswitch*.deb /packages && \ + cp /usr/src/openvswitch-*.deb /packages && \ + cp /usr/src/python3-openvswitch*.deb /packages && \ + cp /usr/src/ovn-*.deb /packages && \ + cd /packages && rm -f *dbg* *datapath* *docker* *vtep* *ipsec* *test* *dev* + +FROM ubuntu:22.04 + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt 
update && apt upgrade -y && apt install ca-certificates python3 hostname libunwind8 netbase \ + ethtool iproute2 ncat libunbound-dev procps libatomic1 kmod iptables \ + tcpdump ipset curl uuid-runtime openssl inetutils-ping arping ndisc6 \ + logrotate libjemalloc2 dnsutils libnuma-dev -y --no-install-recommends && \ + rm -rf /var/lib/apt/lists/* && \ + cd /usr/sbin && \ + ln -sf /usr/sbin/iptables-legacy iptables && \ + ln -sf /usr/sbin/ip6tables-legacy ip6tables && \ + rm -rf /etc/localtime + +RUN mkdir -p /var/run/openvswitch && \ + mkdir -p /var/run/ovn && \ + mkdir -p /etc/cni/net.d && \ + mkdir -p /opt/cni/bin + +ARG ARCH +ENV CNI_VERSION=v0.8.7 +RUN curl -sSf -L --retry 5 https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-${ARCH}-${CNI_VERSION}.tgz | tar -xz -C . ./loopback ./portmap ./macvlan + +ENV KUBE_VERSION="v1.21.1" + +RUN curl -L https://dl.k8s.io/${KUBE_VERSION}/kubernetes-client-linux-${ARCH}.tar.gz | tar -xz -C . && cp ./kubernetes/client/bin/kubectl /usr/bin/kubectl \ + && chmod +x /usr/bin/kubectl && rm -rf ./kubernetes + +RUN --mount=type=bind,target=/packages,from=ovs-builder,source=/packages \ + dpkg -i /packages/libopenvswitch*.deb && \ + dpkg -i /packages/openvswitch-*.deb && \ + dpkg -i /packages/python3-openvswitch*.deb &&\ + dpkg -i --ignore-depends=openvswitch-switch,openvswitch-common /packages/ovn-*.deb \ No newline at end of file diff --git a/dist/images/Dockerfile.dpdk b/dist/images/Dockerfile.dpdk new file mode 100644 index 00000000000..86fcac6b29b --- /dev/null +++ b/dist/images/Dockerfile.dpdk @@ -0,0 +1,21 @@ +# syntax = docker/dockerfile:experimental +FROM kubeovn/kube-ovn-base:v1.10.0-dpdk + +COPY *.sh /kube-ovn/ +COPY kubectl-ko /kube-ovn/kubectl-ko +COPY 01-kube-ovn.conflist /kube-ovn/01-kube-ovn.conflist +COPY logrotate/* /etc/logrotate.d/ +COPY grace_stop_ovn_controller /usr/share/ovn/scripts/grace_stop_ovn_controller + +WORKDIR /kube-ovn + +COPY kube-ovn 
/kube-ovn/kube-ovn +COPY kube-ovn-cmd /kube-ovn/kube-ovn-cmd +RUN ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-controller && \ + ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-daemon && \ + ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-monitor && \ + ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-pinger && \ + ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-speaker && \ + ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-webhook && \ + ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-controller-healthcheck && \ + ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-leader-checker \ No newline at end of file diff --git a/dist/images/cleanup.sh b/dist/images/cleanup.sh index e27e0e27eda..243e928a9ca 100644 --- a/dist/images/cleanup.sh +++ b/dist/images/cleanup.sh @@ -49,6 +49,7 @@ done kubectl delete --ignore-not-found svc ovn-nb ovn-sb ovn-northd -n kube-system kubectl delete --ignore-not-found deploy ovn-central -n kube-system kubectl delete --ignore-not-found ds ovs-ovn -n kube-system +kubectl delete --ignore-not-found ds ovs-ovn-dpdk -n kube-system kubectl delete --ignore-not-found secret kube-ovn-tls -n kube-system kubectl delete --ignore-not-found sa ovn -n kube-system kubectl delete --ignore-not-found clusterrole system:ovn diff --git a/dist/images/install.sh b/dist/images/install.sh index d272d05c1ef..3a95592bf11 100755 --- a/dist/images/install.sh +++ b/dist/images/install.sh @@ -67,6 +67,9 @@ if [ "$ENABLE_VLAN" = "true" ]; then fi fi +# hybrid dpdk +HYBRID_DPDK="false" + # DPDK DPDK="false" DPDK_SUPPORTED_VERSIONS=("19.11") @@ -82,6 +85,7 @@ display_help() { echo "Usage: $0 [option...]" echo echo " -h, --help Print Help (this message) and exit" + echo " --with-hybrid-dpdk Install Kube-OVN with nodes which run ovs-dpdk or ovs-kernel" echo " --with-dpdk= Install Kube-OVN with OVS-DPDK instead of kernel OVS" echo " --dpdk-cpu=m Configure DPDK to use a specific amount of CPU" echo " --dpdk-memory=Gi Configure DPDK to use a specific amount of memory" @@ -97,6 
+101,9 @@ then -h|--help) display_help ;; + --with-hybrid-dpdk) + HYBRID_DPDK="true" + ;; --with-dpdk=*) DPDK=true DPDK_VERSION="${1#*=}" @@ -163,7 +170,7 @@ if [[ $ENABLE_SSL = "true" ]];then echo "" fi -echo "[Step 1/6] Label kube-ovn-master node" +echo "[Step 1/6] Label kube-ovn-master node and label datapath type" count=$(kubectl get no -l$LABEL --no-headers -o wide | wc -l | sed 's/ //g') if [ "$count" = "0" ]; then echo "ERROR: No node with label $LABEL" @@ -171,6 +178,9 @@ if [ "$count" = "0" ]; then fi kubectl label no -lbeta.kubernetes.io/os=linux kubernetes.io/os=linux --overwrite kubectl label no -l$LABEL kube-ovn/role=master --overwrite + +kubectl label no -lovn.kubernetes.io/ovs_dp_type!=userspace ovn.kubernetes.io/ovs_dp_type=kernel --overwrite + echo "-------------------------------" echo "" @@ -1319,6 +1329,7 @@ spec: hugepages-1Gi: 1Gi nodeSelector: kubernetes.io/os: "linux" + ovn.kubernetes.io/ovs_dp_type: "kernel" volumes: - name: host-modules hostPath: @@ -1780,6 +1791,7 @@ spec: memory: 800Mi nodeSelector: kubernetes.io/os: "linux" + ovn.kubernetes.io/ovs_dp_type: kernel volumes: - name: host-modules hostPath: @@ -1820,6 +1832,172 @@ fi kubectl apply -f kube-ovn-crd.yaml kubectl apply -f ovn.yaml + +if $HYBRID_DPDK; then + +cat < ovn-dpdk.yaml +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: ovs-ovn-dpdk + namespace: kube-system + annotations: + kubernetes.io/description: | + This daemon set launches the openvswitch daemon. 
+spec: + selector: + matchLabels: + app: ovs-dpdk + updateStrategy: + type: OnDelete + template: + metadata: + labels: + app: ovs-dpdk + component: network + type: infra + spec: + tolerations: + - operator: Exists + priorityClassName: system-cluster-critical + serviceAccountName: ovn + hostNetwork: true + hostPID: true + containers: + - name: openvswitch + image: "$REGISTRY/kube-ovn:${VERSION}-dpdk" + imagePullPolicy: $IMAGE_PULL_POLICY + command: ["/kube-ovn/start-ovs-dpdk-v2.sh"] + securityContext: + runAsUser: 0 + privileged: true + env: + - name: ENABLE_SSL + value: "$ENABLE_SSL" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: HW_OFFLOAD + value: "$HW_OFFLOAD" + - name: TUNNEL_TYPE + value: "$TUNNEL_TYPE" + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: OVN_DB_IPS + value: $addresses + volumeMounts: + - mountPath: /opt/ovs-config + name: host-config-ovs + - name: shareddir + mountPath: /var/lib/kubelet/pods + - name: hugepage + mountPath: /dev/hugepages + - mountPath: /lib/modules + name: host-modules + readOnly: true + - mountPath: /var/run/openvswitch + name: host-run-ovs + mountPropagation: HostToContainer + - mountPath: /var/run/ovn + name: host-run-ovn + - mountPath: /sys + name: host-sys + readOnly: true + - mountPath: /etc/cni/net.d + name: cni-conf + - mountPath: /etc/openvswitch + name: host-config-openvswitch + - mountPath: /etc/ovn + name: host-config-ovn + - mountPath: /var/log/openvswitch + name: host-log-ovs + - mountPath: /var/log/ovn + name: host-log-ovn + - mountPath: /etc/localtime + name: localtime + - mountPath: /var/run/tls + name: kube-ovn-tls + readinessProbe: + exec: + command: + - bash + - -c + - LOG_ROTATE=true /kube-ovn/ovs-healthcheck.sh + periodSeconds: 5 + timeoutSeconds: 45 + livenessProbe: + exec: + command: + - bash + - /kube-ovn/ovs-healthcheck.sh + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 5 + timeoutSeconds: 45 + resources: + requests: + 
cpu: 200m + hugepages-2Mi: 1Gi + memory: 200Mi + limits: + cpu: 1000m + hugepages-2Mi: 1Gi + memory: 800Mi + nodeSelector: + kubernetes.io/os: "linux" + ovn.kubernetes.io/ovs_dp_type: "userspace" + volumes: + - name: host-config-ovs + hostPath: + path: /opt/ovs-config + type: DirectoryOrCreate + - name: shareddir + hostPath: + path: /var/lib/kubelet/pods + type: '' + - name: hugepage + emptyDir: + medium: HugePages + - name: host-modules + hostPath: + path: /lib/modules + - name: host-run-ovs + hostPath: + path: /run/openvswitch + - name: host-run-ovn + hostPath: + path: /run/ovn + - name: host-sys + hostPath: + path: /sys + - name: cni-conf + hostPath: + path: /etc/cni/net.d + - name: host-config-openvswitch + hostPath: + path: /etc/origin/openvswitch + - name: host-config-ovn + hostPath: + path: /etc/origin/ovn + - name: host-log-ovs + hostPath: + path: /var/log/openvswitch + - name: host-log-ovn + hostPath: + path: /var/log/ovn + - name: localtime + hostPath: + path: /etc/localtime + - name: kube-ovn-tls + secret: + optional: true + secretName: kube-ovn-tls +EOF +kubectl apply -f ovn-dpdk.yaml +fi kubectl rollout status deployment/ovn-central -n kube-system --timeout 300s echo "-------------------------------" echo "" @@ -2029,12 +2207,15 @@ spec: - name: RPMS value: $RPMS volumeMounts: + - name: shared-dir + mountPath: /var/lib/kubelet/pods - mountPath: /etc/openvswitch name: systemid - mountPath: /etc/cni/net.d name: cni-conf - mountPath: /run/openvswitch name: host-run-ovs + mountPropagation: Bidirectional - mountPath: /run/ovn name: host-run-ovn - mountPath: /var/run/netns @@ -2072,6 +2253,9 @@ spec: nodeSelector: kubernetes.io/os: "linux" volumes: + - name: shared-dir + hostPath: + path: /var/lib/kubelet/pods - name: systemid hostPath: path: /etc/origin/openvswitch diff --git a/dist/images/ovs-dpdk-config b/dist/images/ovs-dpdk-config new file mode 100644 index 00000000000..05f77e57525 --- /dev/null +++ b/dist/images/ovs-dpdk-config @@ -0,0 +1,2 @@ 
+ENCAP_IP=192.168.190.2 +DPDK_DEV=0000:00:0b.0 diff --git a/dist/images/start-ovs-dpdk-v2.sh b/dist/images/start-ovs-dpdk-v2.sh new file mode 100755 index 00000000000..a5f3f67aa23 --- /dev/null +++ b/dist/images/start-ovs-dpdk-v2.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +set -euo pipefail + +OVS_DPDK_CONFIG_FILE=/opt/ovs-config/ovs-dpdk-config +if ! test -f "$OVS_DPDK_CONFIG_FILE"; then + echo "missing ovs dpdk config" + exit 1 +fi +source $OVS_DPDK_CONFIG_FILE + + +export PATH=$PATH:/usr/share/openvswitch/scripts +export PATH=$PATH:/usr/share/ovn/scripts + +function quit { + ovs-ctl stop + ovn-ctl stop_controller + exit 0 +} +trap quit EXIT + +CONFIG_FILE=/opt/ovs-config/config.cfg + +# Check if config file exists, create default one if not +if ! test -f "$CONFIG_FILE"; then + mkdir -p $(dirname ${CONFIG_FILE}) + printf %s\\n {dpdk-socket-mem=\"1024\",dpdk-init=true,dpdk-hugepage-dir=/dev/hugepages} > $CONFIG_FILE +fi + +# Start ovsdb +ovs-ctl restart --no-ovs-vswitchd --system-id=random + +# Restrict the number of pthreads ovs-vswitchd creates to reduce the +# amount of RSS it uses on hosts with many cores +# https://bugzilla.redhat.com/show_bug.cgi?id=1571379 +# https://bugzilla.redhat.com/show_bug.cgi?id=1572797 +if [[ `nproc` -gt 12 ]]; then + ovs-vsctl --no-wait set Open_vSwitch . other_config:n-revalidator-threads=4 + ovs-vsctl --no-wait set Open_vSwitch . other_config:n-handler-threads=10 +fi + +# Read the config and setup OVS +while IFS= read -r config_line +do + if [[ $config_line ]] && [[ $config_line != \#* ]] ; then + ovs-vsctl --no-wait set Open_vSwitch . other_config:$config_line + fi +done < "$CONFIG_FILE" + +# Start vswitchd +ovs-ctl restart --no-ovsdb-server --system-id=random +ovs-ctl --protocol=udp --dport=6081 enable-protocol + + + +if ! 
ovs-vsctl br-exists br-phy; then
+ovs-vsctl --may-exist add-br br-phy \
+ -- set Bridge br-phy datapath_type=netdev \
+ -- br-set-external-id br-phy bridge-id br-phy \
+ -- set bridge br-phy fail-mode=standalone
+
+ovs-vsctl --timeout 10 add-port br-phy dpdk0 \
+ -- set Interface dpdk0 type=dpdk options:dpdk-devargs=${DPDK_DEV}
+
+ip addr add ${ENCAP_IP} dev br-phy
+fi
+
+ip link set br-phy up
+
+ovs-vsctl --may-exist add-br br-int \
+ -- set Bridge br-int datapath_type=netdev \
+ -- br-set-external-id br-int bridge-id br-int \
+ -- set bridge br-int fail-mode=secure
+
+
+
+# Start ovn-controller
+ovn-ctl restart_controller
+
+# Set remote ovn-sb for ovn-controller to connect to
+ovs-vsctl set open . external-ids:ovn-remote=tcp:"${OVN_SB_SERVICE_HOST}":"${OVN_SB_SERVICE_PORT}"
+ovs-vsctl set open . external-ids:ovn-remote-probe-interval=10000
+ovs-vsctl set open . external-ids:ovn-openflow-probe-interval=180
+ovs-vsctl set open . external-ids:ovn-encap-type=geneve
+
+tail -f /var/log/openvswitch/ovs-vswitchd.log
\ No newline at end of file
diff --git a/docs/dpdk-hybrid.md b/docs/dpdk-hybrid.md
new file mode 100644
index 00000000000..7337cabb486
--- /dev/null
+++ b/docs/dpdk-hybrid.md
@@ -0,0 +1,249 @@
+# Kube-OVN with nodes which run ovs-dpdk or ovs-kernel
+
+This document describes how to run Kube-OVN with nodes which run ovs-dpdk or ovs-kernel
+
+## Prerequisite
+Nodes which run ovs-dpdk must have a network card bound to the DPDK driver.
+Hugepages must be configured on the host.
+## Label nodes used to run ovs-dpdk
+```bash
+kubectl label nodes ovn.kubernetes.io/ovs_dp_type="userspace"
+```
+## Set up network card
+We use driverctl to persist the device driver configuration.
+Here is an example of binding the DPDK driver to a network card.
+```bash
+driverctl set-override 0000:00:0b.0 uio_pci_generic
+```
+For other drivers, please refer to https://www.dpdk.org/
+
+## Configure node
+Edit the configuration file named ovs-dpdk-config on the node that needs to run ovs-dpdk.
The configuration file needs to be placed in the /opt/ovs-config directory. +```bash +# specify encap IP +ENCAP_IP=192.168.122.193/24 +# specify pci device +DPDK_DEV=0000:00:0b.0 +``` + + +## Set up Kube-OVN +Just run install.sh --with-hybrid-dpdk + +## How to use +Here is an example to create a vhost-user app to use userspace datapath. We create a virtual machine using vhostuser and test if it can access the Internet. + +Install the KVM device plugin for creating virtual machines. More information is available through this website. https://github.com/kubevirt/kubernetes-device-plugins/blob/master/docs/README.kvm.md +```bash +kubectl apply -f manifests/kvm-ds.yml +``` + +Create NetworkAttachmentDefinition +```yaml +apiVersion: k8s.cni.cncf.io/v1 +kind: NetworkAttachmentDefinition +metadata: + name: ovn-dpdk + namespace: default +spec: + config: >- + {"cniVersion": "0.3.0", "type": "kube-ovn", "server_socket": + "/run/openvswitch/kube-ovn-daemon.sock", "provider": "ovn-dpdk.default.ovn", + "vhost_user_socket_volume_name": "vhostuser-sockets", + "vhost_user_socket_name": "sock"} +``` +Create a virtual machine image and tag it as vm-vhostuser:latest +```bash +docker build . -t vm-vhostuser:latest +``` +```dockerfile +From quay.io/kubevirt/virt-launcher:v0.46.1 + +# wget http://cloud.centos.org/centos/7/images/CentOS-7-x86_64-GenericCloud.qcow2 +COPY CentOS-7-x86_64-GenericCloud.qcow2 /var/lib/libvirt/images/CentOS-7-x86_64-GenericCloud.qcow2 +``` +Create virtual machine. 
+```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: vm-config +data: + start.sh: | + chmod u+w /etc/libvirt/qemu.conf + echo "hugetlbfs_mount = \"/dev/hugepages\"" >> /etc/libvirt/qemu.conf + virtlogd & + libvirtd & + + mkdir /var/lock + + sleep 5 + + virsh define /root/vm/vm.xml + virsh start vm + + tail -f /dev/null + vm.xml: | + + vm + 4a9b3f53-fa2a-47f3-a757-dd87720d9d1d + 2097152 + 2097152 + + + + + + 2 + + 4096 + + + + + + hvm + + + + + + + + + + + + + + restart + + /usr/libexec/qemu-kvm + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vm-deployment + labels: + app: vm +spec: + replicas: 1 + selector: + matchLabels: + app: vm + template: + metadata: + labels: + app: vm + annotations: + k8s.v1.cni.cncf.io/networks: default/ovn-dpdk + ovn-dpdk.default.ovn.kubernetes.io/ip_address: 10.16.0.96 + ovn-dpdk.default.ovn.kubernetes.io/mac_address: 00:00:00:0A:30:89 + spec: + nodeSelector: + ovn.kubernetes.io/ovs_dp_type: userspace + securityContext: + runAsUser: 0 + volumes: + - name: vhostuser-sockets + emptyDir: {} + - name: xml + configMap: + name: vm-config + - name: hugepage + emptyDir: + medium: HugePages-2Mi + - name: libvirt-runtime + emptyDir: {} + containers: + - name: vm + image: vm-vhostuser:latest + command: ["bash", "/root/vm/start.sh"] + securityContext: + capabilities: + add: + - NET_BIND_SERVICE + - SYS_NICE + - NET_RAW + - NET_ADMIN + privileged: false + runAsUser: 0 + resources: + limits: + cpu: '2' + devices.kubevirt.io/kvm: '1' + memory: '8784969729' + hugepages-2Mi: 2Gi + requests: + cpu: 666m + devices.kubevirt.io/kvm: '1' + ephemeral-storage: 50M + memory: '4490002433' + volumeMounts: + - name: vhostuser-sockets + mountPath: /var/run/vm + - name: xml + mountPath: /root/vm/ + - mountPath: /dev/hugepages + name: hugepage + - name: libvirt-runtime + mountPath: /var/run/libvirt +``` +After waiting for the Pod of the virtual machine to start, attach shell in to the Pod. +```bash +# set vm root password +bash-5.0# virsh set-user-password vm root 12345 +Password set successfully for root in vm + +# console to vm +bash-5.0# virsh console vm +Connected to domain 'vm' +Escape character is ^] (Ctrl + ]) + +CentOS Linux 7 (Core) +Kernel 3.10.0-1127.el7.x86_64 on an x86_64 + +localhost login: root +Password: +Last login: Fri Feb 25 09:52:54 on ttyS0 +[root@localhost ~]# +``` +Now you have logged in the virtual machine, you can access the Internet after configuring the IP and routing entries. 
+```bash +ip link set eth0 mtu 1400 +ip addr add 10.16.0.96/16 dev eth0 +ip ro add default via 10.16.0.1 +ping 114.114.114.114 +``` + diff --git a/pkg/controller/vpc_nat_gateway.go b/pkg/controller/vpc_nat_gateway.go index 6fcd2f0fada..74a2d32695e 100644 --- a/pkg/controller/vpc_nat_gateway.go +++ b/pkg/controller/vpc_nat_gateway.go @@ -308,7 +308,6 @@ func (c *Controller) syncVpcNatGwRules(key string) error { c.updateVpcFloatingIpQueue.Add(key) c.updateVpcDnatQueue.Add(key) c.updateVpcSnatQueue.Add(key) - c.updateVpcSnatQueue.Add(key) c.updateVpcSubnetQueue.Add(key) return nil } diff --git a/pkg/daemon/config.go b/pkg/daemon/config.go index f82b86a0428..ff4209eabe5 100644 --- a/pkg/daemon/config.go +++ b/pkg/daemon/config.go @@ -133,7 +133,7 @@ func (config *Configuration) initNicConfig(nicBridgeMappings map[string]string) encapIP string ) - //Support to specify node network card separately + // Support to specify node network card separately node, err := config.KubeClient.CoreV1().Nodes().Get(context.Background(), config.NodeName, metav1.GetOptions{}) if err != nil { klog.Errorf("Failed to find node info, err: %v", err) @@ -144,6 +144,11 @@ func (config *Configuration) initNicConfig(nicBridgeMappings map[string]string) klog.Infof("Find node tunnel interface name: %v", nodeTunnelName) } + isDPDKNode := node.GetLabels()[util.OvsDpTypeLabel] == "userspace" + + if isDPDKNode { + config.Iface = "br-phy" + } if config.Iface == "" { podIP, ok := os.LookupEnv("POD_IP") if !ok || podIP == "" { diff --git a/pkg/daemon/handler.go b/pkg/daemon/handler.go index 27fab31769b..6cad40e9e44 100644 --- a/pkg/daemon/handler.go +++ b/pkg/daemon/handler.go @@ -4,15 +4,19 @@ import ( "context" "fmt" "net/http" + "os" + "path" "strconv" "strings" "time" "github.com/emicklei/go-restful/v3" + "golang.org/x/sys/unix" v1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + 
"k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" @@ -119,6 +123,16 @@ func (csh cniServerHandler) handleAdd(req *restful.Request, resp *restful.Respon } if podRequest.DeviceID != "" { nicType = util.OffloadType + } else if podRequest.VhostUserSocketVolumeName != "" { + nicType = util.DpdkType + err = createShortSharedDir(pod, podRequest.VhostUserSocketVolumeName) + if err != nil { + klog.Error(err.Error()) + if err = resp.WriteHeaderAndEntity(http.StatusInternalServerError, request.CniResponse{Err: err.Error()}); err != nil { + klog.Errorf("failed to write response: %v", err) + } + return + } } else { nicType = pod.Annotations[fmt.Sprintf(util.PodNicAnnotationTemplate, podRequest.Provider)] } @@ -208,6 +222,8 @@ func (csh cniServerHandler) handleAdd(req *restful.Request, resp *restful.Respon klog.Infof("create container interface %s mac %s, ip %s, cidr %s, gw %s, custom routes %v", ifName, macAddr, ipAddr, cidr, gw, podRequest.Routes) if nicType == util.InternalType { podNicName, err = csh.configureNicWithInternalPort(podRequest.PodName, podRequest.PodNamespace, podRequest.Provider, podRequest.NetNs, podRequest.ContainerID, ifName, macAddr, mtu, ipAddr, gw, isDefaultRoute, podRequest.Routes, ingress, egress, priority, podRequest.DeviceID, nicType, gatewayCheckMode) + } else if nicType == util.DpdkType { + err = csh.configureDpdkNic(podRequest.PodName, podRequest.PodNamespace, podRequest.Provider, podRequest.NetNs, podRequest.ContainerID, ifName, macAddr, mtu, ipAddr, gw, ingress, egress, priority, path.Join("/var", getShortSharedDir(pod.UID, podRequest.VhostUserSocketVolumeName)), podRequest.VhostUserSocketName) } else { podNicName = ifName err = csh.configureNic(podRequest.PodName, podRequest.PodNamespace, podRequest.Provider, podRequest.NetNs, podRequest.ContainerID, podRequest.VfDriver, ifName, macAddr, mtu, ipAddr, gw, isDefaultRoute, podRequest.Routes, ingress, egress, priority, podRequest.DeviceID, nicType, gatewayCheckMode) @@ 
-242,6 +258,68 @@ func (csh cniServerHandler) handleAdd(req *restful.Request, resp *restful.Respon } } +func createShortSharedDir(pod *v1.Pod, volumeName string) (err error) { + var volume *v1.Volume + for index, v := range pod.Spec.Volumes { + if v.Name == volumeName { + volume = &pod.Spec.Volumes[index] + break + } + } + if volume == nil { + return fmt.Errorf("can not fount volume %s in pod %s", volumeName, pod.Name) + } + if volume.EmptyDir == nil { + return fmt.Errorf("volume %s is not empty dir", volume.Name) + } + originSharedDir := fmt.Sprintf("/var/lib/kubelet/pods/%s/volumes/kubernetes.io~empty-dir/%s", pod.UID, volumeName) + newSharedDir := getShortSharedDir(pod.UID, volumeName) + _, err = os.Stat(newSharedDir) + if os.IsNotExist(err) { + err = os.MkdirAll(newSharedDir, 0750) + if err != nil { + return fmt.Errorf("createSharedDir: Failed to create dir (%s): %v", newSharedDir, err) + } + + if strings.Contains(newSharedDir, util.DefaultHostVhostuserBaseDir) { + klog.Infof("createSharedDir: Mount from %s to %s", originSharedDir, newSharedDir) + err = unix.Mount(originSharedDir, newSharedDir, "", unix.MS_BIND, "") + if err != nil { + return fmt.Errorf("createSharedDir: Failed to bind mount: %s", err) + } + } + return nil + + } + return err + +} + +func removeShortSharedDir(pod *v1.Pod, volumeName string) (err error) { + sharedDir := getShortSharedDir(pod.UID, volumeName) + _, err = os.Stat(sharedDir) + if os.IsNotExist(err) { + klog.Errorf("shared directory %s does not exist to unmount", sharedDir) + return nil + } + err = unix.Unmount(sharedDir, 0) + if err != nil { + klog.Errorf("Failed to unmount dir: %v", err) + return err + } + err = os.Remove(sharedDir) + if err != nil { + klog.Errorf("Failed to remove dir: %v", err) + return err + } + + return nil +} + +func getShortSharedDir(uid types.UID, volumeName string) string { + return path.Join(util.DefaultHostVhostuserBaseDir, string(uid), volumeName) +} + func (csh cniServerHandler) 
createOrUpdateIPCr(podRequest request.CniRequest, subnet, ip, macAddr string) error { v4IP, v6IP := util.SplitStringIP(ip) ipCrName := ovs.PodNameToPortName(podRequest.PodName, podRequest.PodNamespace, podRequest.Provider) @@ -348,6 +426,17 @@ func (csh cniServerHandler) handleDel(req *restful.Request, resp *restful.Respon var nicType string if podRequest.DeviceID != "" { nicType = util.OffloadType + } else if podRequest.VhostUserSocketVolumeName != "" { + nicType = util.DpdkType + err = removeShortSharedDir(pod, podRequest.VhostUserSocketVolumeName) + if err != nil { + klog.Error(err.Error()) + if err = resp.WriteHeaderAndEntity(http.StatusInternalServerError, request.CniResponse{Err: err.Error()}); err != nil { + klog.Errorf("failed to write response: %v", err) + } + return + } + } else { nicType = pod.Annotations[fmt.Sprintf(util.PodNicAnnotationTemplate, podRequest.Provider)] } diff --git a/pkg/daemon/ovs.go b/pkg/daemon/ovs.go index ccf8c21f43e..84af8a38af0 100644 --- a/pkg/daemon/ovs.go +++ b/pkg/daemon/ovs.go @@ -5,6 +5,8 @@ import ( "fmt" "net" "os" + "os/exec" + "path" "path/filepath" "regexp" "strings" @@ -29,6 +31,31 @@ const gatewayCheckMaxRetry = 200 var pciAddrRegexp = regexp.MustCompile(`\b([0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}.\d{1}\S*)`) +func (csh cniServerHandler) configureDpdkNic(podName, podNamespace, provider, netns, containerID, ifName, mac string, mtu int, ip, gateway, ingress, egress, priority, sharedDir, socketName string) error { + hostNicName, _ := generateNicName(containerID, ifName) + + ipStr := util.GetIpWithoutMask(ip) + ifaceID := ovs.PodNameToPortName(podName, podNamespace, provider) + ovs.CleanDuplicatePort(ifaceID) + // Add veth pair host end to ovs port + output, err := ovs.Exec(ovs.MayExist, "add-port", "br-int", hostNicName, "--", + "set", "interface", hostNicName, + "type=dpdkvhostuserclient", + fmt.Sprintf("options:vhost-server-path=%s", path.Join(sharedDir, socketName)), + fmt.Sprintf("external_ids:iface-id=%s", 
ifaceID), + fmt.Sprintf("external_ids:pod_name=%s", podName), + fmt.Sprintf("external_ids:pod_namespace=%s", podNamespace), + fmt.Sprintf("external_ids:ip=%s", ipStr), + fmt.Sprintf("external_ids:pod_netns=%s", netns)) + if err != nil { + return fmt.Errorf("add nic to ovs failed %v: %q", err, output) + } + if err = ovs.SetInterfaceBandwidth(podName, podNamespace, ifaceID, egress, ingress, priority); err != nil { + return err + } + return nil +} + func (csh cniServerHandler) configureNic(podName, podNamespace, provider, netns, containerID, vfDriver, ifName, mac string, mtu int, ip, gateway string, isDefaultRoute bool, routes []request.Route, ingress, egress, priority, DeviceID, nicType string, gwCheckMode int) error { var err error var hostNicName, containerNicName string @@ -75,6 +102,17 @@ func (csh cniServerHandler) configureNic(podName, podNamespace, provider, netns, if containerNicName == "" { return nil } + isUserspaceDP, err := ovs.IsUserspaceDataPath() + if err != nil { + return err + } + if isUserspaceDP { + // turn off tx checksum + if err = turnOffNicTxChecksum(containerNicName); err != nil { + return err + } + } + podNS, err := ns.GetNS(netns) if err != nil { return fmt.Errorf("failed to open netns %q: %v", netns, err) @@ -1019,3 +1057,15 @@ func setVfMac(deviceID string, vfIndex int, mac string) error { } return nil } + +func turnOffNicTxChecksum(nicName string) (err error) { + start := time.Now() + args := []string{"-K", nicName, "tx", "off"} + output, err := exec.Command("ethtool", args...).CombinedOutput() + elapsed := float64((time.Since(start)) / time.Millisecond) + klog.V(4).Infof("command %s %s in %vms", "ethtool", strings.Join(args, " "), elapsed) + if err != nil { + return fmt.Errorf("failed to turn off nic tx checksum, output %s, err %s", string(output), err.Error()) + } + return nil +} diff --git a/pkg/ovs/ovs-vsctl.go b/pkg/ovs/ovs-vsctl.go index 7827d8fe582..7975dcf1f07 100644 --- a/pkg/ovs/ovs-vsctl.go +++ b/pkg/ovs/ovs-vsctl.go @@ -305,17 
+305,17 @@ func ValidatePortVendor(port string) (bool, error) { return util.ContainsString(output, port), err } -//config mirror for interface by pod annotations and install param +// config mirror for interface by pod annotations and install param func ConfigInterfaceMirror(globalMirror bool, open string, iface string) error { if !globalMirror { - //find interface name for port + // find interface name for port interfaceList, err := ovsFind("interface", "name", fmt.Sprintf("external-ids:iface-id=%s", iface)) if err != nil { return err } for _, ifName := range interfaceList { - //ifName example: xxx_h - //find port uuid by interface name + // ifName example: xxx_h + // find port uuid by interface name portUUIDs, err := ovsFind("port", "_uuid", fmt.Sprintf("name=%s", ifName)) if err != nil { return err @@ -325,7 +325,7 @@ func ConfigInterfaceMirror(globalMirror bool, open string, iface string) error { } portId := portUUIDs[0] if open == "true" { - //add port to mirror + // add port to mirror err = ovsAdd("mirror", util.MirrorDefaultName, "select_dst_port", portId) if err != nil { return err @@ -343,7 +343,7 @@ func ConfigInterfaceMirror(globalMirror bool, open string, iface string) error { } for _, mirrorPortIds := range mirrorPorts { if strings.Contains(mirrorPortIds, portId) { - //remove port from mirror + // remove port from mirror _, err := Exec("remove", "mirror", util.MirrorDefaultName, "select_dst_port", portId) if err != nil { return err @@ -736,3 +736,11 @@ func ListQosQueueIds() (map[string]string, error) { } return result, nil } + +func IsUserspaceDataPath() (is bool, err error) { + dp, err := ovsFind("bridge", "datapath_type", "name=br-int") + if err != nil { + return false, err + } + return len(dp) > 0 && dp[0] == "netdev", nil +} diff --git a/pkg/request/cniserver.go b/pkg/request/cniserver.go index 06ddba32a53..59cc9a04ead 100644 --- a/pkg/request/cniserver.go +++ b/pkg/request/cniserver.go @@ -33,6 +33,10 @@ type CniRequest struct { VfDriver string 
`json:"vf_driver"` // PciAddrs in case of using sriov DeviceID string `json:"deviceID"` + // dpdk + // empty dir volume for sharing vhost user unix socket + VhostUserSocketVolumeName string `json:"vhost_user_socket_volume_name"` + VhostUserSocketName string `json:"vhost_user_socket_name"` } // CniResponse is the cniserver response format diff --git a/pkg/util/const.go b/pkg/util/const.go index efef1394228..61706cda7e0 100644 --- a/pkg/util/const.go +++ b/pkg/util/const.go @@ -71,6 +71,8 @@ const ( TunnelInterfaceAnnotation = "ovn.kubernetes.io/tunnel_interface" + OvsDpTypeLabel = "ovn.kubernetes.io/ovs_dp_type" + SubnetNameLabel = "ovn.kubernetes.io/subnet" ICGatewayLabel = "ovn.kubernetes.io/ic-gw" ExGatewayLabel = "ovn.kubernetes.io/external-gw" @@ -128,6 +130,9 @@ const ( OffloadType = "offload-port" InternalType = "internal-port" + DpdkType = "dpdk-port" + + DefaultHostVhostuserBaseDir = "/run/openvswitch/vhost_sockets" ChassisLoc = "/etc/openvswitch/system-id.conf" HostnameEnv = "KUBE_NODE_NAME" From bae63cba38f328063315cb0bb37f4a6e84a396e9 Mon Sep 17 00:00:00 2001 From: luoyunhe Date: Fri, 11 Mar 2022 15:57:06 +0800 Subject: [PATCH 2/3] fix the doc and some code --- docs/dpdk-hybrid.md | 14 +++++++------- pkg/daemon/handler.go | 16 ++++++---------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/docs/dpdk-hybrid.md b/docs/dpdk-hybrid.md index 7337cabb486..c275c9cb8de 100644 --- a/docs/dpdk-hybrid.md +++ b/docs/dpdk-hybrid.md @@ -3,22 +3,22 @@ This document describes how to run Kube-OVN with nodes which run ovs-dpdk or ovs-kernel ## Prerequisite -Node which run ovs-dpdk must have a net card bound to the dpdk driver. -Hugepages on the host. -## Label nodes used run ovs-dpdk +* Node which runs ovs-dpdk must have a net card bound to the dpdk driver. +* Hugepages on the host. 
+## Label nodes that need to run ovs-dpdk ```bash kubectl label nodes ovn.kubernetes.io/ovs_dp_type="userspace" ``` ## Set up net card -We use driverctl to persist the device driver configuration. +We use `driverctl` to persist the device driver configuration. Here is an example to bind dpdk driver to a net card. ```bash driverctl set-override 0000:00:0b.0 uio_pci_generic ``` -For other drivers, please refer to https://www.dpdk.org/ +For other drivers, please refer to https://www.dpdk.org/. ## configrue node -Edit the configuration file named ovs-dpdk-config on the node that needs to run ovs-dpdk. The configuration file needs to be placed in the /opt/ovs-config directory. +Edit the configuration file named `ovs-dpdk-config` on the node that needs to run ovs-dpdk. The configuration file needs to be placed in the `/opt/ovs-config` directory. ```bash # specify encap IP ENCAP_IP=192.168.122.193/24 @@ -28,7 +28,7 @@ DPDK_DEV=0000:00:0b.0 ## Set up Kube-OVN -Just run install.sh --with-hybrid-dpdk +Just run `install.sh --with-hybrid-dpdk` ## How to use Here is an example to create a vhost-user app to use userspace datapath. We create a virtual machine using vhostuser and test if it can access the Internet. 
diff --git a/pkg/daemon/handler.go b/pkg/daemon/handler.go index 6cad40e9e44..2d40f6b0adf 100644 --- a/pkg/daemon/handler.go +++ b/pkg/daemon/handler.go @@ -125,8 +125,7 @@ func (csh cniServerHandler) handleAdd(req *restful.Request, resp *restful.Respon nicType = util.OffloadType } else if podRequest.VhostUserSocketVolumeName != "" { nicType = util.DpdkType - err = createShortSharedDir(pod, podRequest.VhostUserSocketVolumeName) - if err != nil { + if err = createShortSharedDir(pod, podRequest.VhostUserSocketVolumeName); err != nil { klog.Error(err.Error()) if err = resp.WriteHeaderAndEntity(http.StatusInternalServerError, request.CniResponse{Err: err.Error()}); err != nil { klog.Errorf("failed to write response: %v", err) @@ -267,15 +266,14 @@ func createShortSharedDir(pod *v1.Pod, volumeName string) (err error) { } } if volume == nil { - return fmt.Errorf("can not fount volume %s in pod %s", volumeName, pod.Name) + return fmt.Errorf("can not found volume %s in pod %s", volumeName, pod.Name) } if volume.EmptyDir == nil { return fmt.Errorf("volume %s is not empty dir", volume.Name) } originSharedDir := fmt.Sprintf("/var/lib/kubelet/pods/%s/volumes/kubernetes.io~empty-dir/%s", pod.UID, volumeName) newSharedDir := getShortSharedDir(pod.UID, volumeName) - _, err = os.Stat(newSharedDir) - if os.IsNotExist(err) { + if _, err = os.Stat(newSharedDir); os.IsNotExist(err) { err = os.MkdirAll(newSharedDir, 0750) if err != nil { return fmt.Errorf("createSharedDir: Failed to create dir (%s): %v", newSharedDir, err) @@ -297,9 +295,8 @@ func createShortSharedDir(pod *v1.Pod, volumeName string) (err error) { func removeShortSharedDir(pod *v1.Pod, volumeName string) (err error) { sharedDir := getShortSharedDir(pod.UID, volumeName) - _, err = os.Stat(sharedDir) - if os.IsNotExist(err) { - klog.Errorf("shared directory %s does not exist to unmount", sharedDir) + if _, err = os.Stat(sharedDir); os.IsNotExist(err) { + klog.Errorf("shared directory %s does not exist to unmount, %s", 
sharedDir, err) return nil } err = unix.Unmount(sharedDir, 0) @@ -428,8 +425,7 @@ func (csh cniServerHandler) handleDel(req *restful.Request, resp *restful.Respon nicType = util.OffloadType } else if podRequest.VhostUserSocketVolumeName != "" { nicType = util.DpdkType - err = removeShortSharedDir(pod, podRequest.VhostUserSocketVolumeName) - if err != nil { + if err = removeShortSharedDir(pod, podRequest.VhostUserSocketVolumeName); err != nil { klog.Error(err.Error()) if err = resp.WriteHeaderAndEntity(http.StatusInternalServerError, request.CniResponse{Err: err.Error()}); err != nil { klog.Errorf("failed to write response: %v", err) From ac92efc7550904b4ac637336f891428ad9def61b Mon Sep 17 00:00:00 2001 From: luoyunhe Date: Fri, 11 Mar 2022 16:47:14 +0800 Subject: [PATCH 3/3] Support customized dpdk tunnel network card name --- dist/images/install.sh | 5 +++++ dist/images/start-ovs-dpdk-v2.sh | 18 ++++++++++-------- pkg/daemon/config.go | 5 ++++- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/dist/images/install.sh b/dist/images/install.sh index 3a95592bf11..009824a8e84 100755 --- a/dist/images/install.sh +++ b/dist/images/install.sh @@ -16,6 +16,8 @@ ENABLE_EXTERNAL_VPC=${ENABLE_EXTERNAL_VPC:-true} # The nic to support container network can be a nic name or a group of regex # separated by comma, if empty will use the nic that the default route use IFACE=${IFACE:-} +# Specifies the name of the dpdk tunnel iface. 
+DPDK_TUNNEL_IFACE=${DPDK_TUNNEL_IFACE:-br-phy} CNI_CONF_DIR="/etc/cni/net.d" CNI_BIN_DIR="/opt/cni/bin" @@ -1882,6 +1884,8 @@ spec: value: "$HW_OFFLOAD" - name: TUNNEL_TYPE value: "$TUNNEL_TYPE" + - name: DPDK_TUNNEL_IFACE + value: "$DPDK_TUNNEL_IFACE" - name: KUBE_NODE_NAME valueFrom: fieldRef: @@ -2183,6 +2187,7 @@ spec: - --encap-checksum=true - --service-cluster-ip-range=$SVC_CIDR - --iface=${IFACE} + - --dpdk-tunnel-iface=${DPDK_TUNNEL_IFACE} - --network-type=$NETWORK_TYPE - --default-interface-name=$VLAN_INTERFACE_NAME - --logtostderr=false diff --git a/dist/images/start-ovs-dpdk-v2.sh b/dist/images/start-ovs-dpdk-v2.sh index a5f3f67aa23..b7e3946c733 100755 --- a/dist/images/start-ovs-dpdk-v2.sh +++ b/dist/images/start-ovs-dpdk-v2.sh @@ -2,6 +2,8 @@ set -euo pipefail +DPDK_TUNNEL_IFACE=${DPDK_TUNNEL_IFACE:-br-phy} + OVS_DPDK_CONFIG_FILE=/opt/ovs-config/ovs-dpdk-config if ! test -f "$OVS_DPDK_CONFIG_FILE"; then echo "missing ovs dpdk config" @@ -54,19 +56,19 @@ ovs-ctl --protocol=udp --dport=6081 enable-protocol -if ! ovs-vsctl br-exists br-phy; then -ovs-vsctl --may-exist add-br br-phy \ - -- set Bridge br-phy datapath_type=netdev \ - -- br-set-external-id br-phy bridge-id br-phy \ - -- set bridge br-phy fail-mode=standalone +if ! 
ovs-vsctl br-exists ${DPDK_TUNNEL_IFACE}; then +ovs-vsctl --may-exist add-br ${DPDK_TUNNEL_IFACE} \ + -- set Bridge ${DPDK_TUNNEL_IFACE} datapath_type=netdev \ + -- br-set-external-id ${DPDK_TUNNEL_IFACE} bridge-id ${DPDK_TUNNEL_IFACE} \ + -- set bridge ${DPDK_TUNNEL_IFACE} fail-mode=standalone -ovs-vsctl --timeout 10 add-port br-phy dpdk0 \ +ovs-vsctl --timeout 10 add-port ${DPDK_TUNNEL_IFACE} dpdk0 \ -- set Interface dpdk0 type=dpdk options:dpdk-devargs=${DPDK_DEV} -ip addr add ${ENCAP_IP} dev br-phy +ip addr add ${ENCAP_IP} dev ${DPDK_TUNNEL_IFACE} fi -ip link set br-phy up +ip link set ${DPDK_TUNNEL_IFACE} up ovs-vsctl --may-exist add-br br-int \ -- set Bridge br-int datapath_type=netdev \ diff --git a/pkg/daemon/config.go b/pkg/daemon/config.go index ff4209eabe5..6e70fcbfb13 100644 --- a/pkg/daemon/config.go +++ b/pkg/daemon/config.go @@ -27,6 +27,7 @@ import ( // Configuration is the daemon conf type Configuration struct { Iface string + DPDKTunnelIface string MTU int MSS int EnableMirror bool @@ -52,6 +53,7 @@ type Configuration struct { func ParseFlags(nicBridgeMappings map[string]string) (*Configuration, error) { var ( argIface = pflag.String("iface", "", "The iface used to inter-host pod communication, can be a nic name or a group of regex separated by comma (default the default route iface)") + argDPDKTunnelIface = pflag.String("dpdk-tunnel-iface", "br-phy", "Specifies the name of the dpdk tunnel iface.") argMTU = pflag.Int("mtu", 0, "The MTU used by pod iface in overlay networks (default iface MTU - 100)") argEnableMirror = pflag.Bool("enable-mirror", false, "Enable traffic mirror (default false)") argMirrorNic = pflag.String("mirror-iface", "mirror0", "The mirror nic name that will be created by kube-ovn") @@ -97,6 +99,7 @@ func ParseFlags(nicBridgeMappings map[string]string) (*Configuration, error) { } config := &Configuration{ Iface: *argIface, + DPDKTunnelIface: *argDPDKTunnelIface, MTU: *argMTU, EnableMirror: *argEnableMirror, MirrorNic: 
*argMirrorNic, @@ -147,7 +150,7 @@ func (config *Configuration) initNicConfig(nicBridgeMappings map[string]string) isDPDKNode := node.GetLabels()[util.OvsDpTypeLabel] == "userspace" if isDPDKNode { - config.Iface = "br-phy" + config.Iface = config.DPDKTunnelIface } if config.Iface == "" { podIP, ok := os.LookupEnv("POD_IP")