k3d: Introduce k3d SR-IOV provider
Since kubernetes-sigs/kind#2999
blocks us from updating to newer k8s versions using kind,
we are introducing k3d.

Changes:

* Support for local multi-instance clusters was removed; we are not using it,
and it shouldn't affect adding multi-instance support on CI
once we want to introduce it.
* Added graceful release of the SR-IOV nics (see the sketch after this list).
It nicely reduces the downtime between cluster-down and cluster-up,
as the nics otherwise disappear for a few minutes.
* Only one PF per node is supported; we don't need more for now.
* Use the k3d local registry instead of one of our own.
* The provider is hardcoded to 1 server (master node) and 2 agents (workers).
If we need a different configuration it can be added in a follow-up PR;
for now there is no reason to support other configurations.
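
As a rough illustration of the graceful release step (a minimal sketch of the idea, not the provider's
actual code; the node name, PF name and docker as the container runtime are all assumptions):

```bash
# Sketch: return a PF from a k3d node container to the host (root) network namespace
# before the container is deleted. Node and PF names are illustrative.
node="k3d-sriov-agent-0"
pf="eth1"

# Expose the container's network namespace to `ip netns`
pid=$(docker inspect -f '{{.State.Pid}}' "$node")
mkdir -p /var/run/netns
ln -sf "/proc/$pid/ns/net" "/var/run/netns/$node"

# Move the PF back to the root namespace. "netns 1" resolves against the host's
# PID 1, because `ip netns exec` only switches the network namespace.
ip netns exec "$node" ip link set "$pf" netns 1

rm -f "/var/run/netns/$node"
```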

Signed-off-by: Or Shoval <[email protected]>
oshoval committed Mar 6, 2023
1 parent a4f8e8a commit e9b40c3
Showing 15 changed files with 3,999 additions and 625 deletions.
123 changes: 52 additions & 71 deletions cluster-up/cluster/k3d-1.25-sriov/README.md
@@ -1,101 +1,82 @@
# K8S 1.23.13 with SR-IOV in a Kind cluster
# K8s 1.25.x with SR-IOV in a K3d cluster

Provides a pre-deployed containerized k8s cluster with version 1.23.13 that runs
using [KinD](https://github.com/kubernetes-sigs/kind)
Provides a pre-deployed containerized k8s cluster with version 1.25.x that runs
using [K3d](https://github.com/k3d-io/k3d)
The cluster is completely ephemeral and is recreated on every cluster restart. The KubeVirt containers are built on the
local machine and are then pushed to a registry which is exposed at
`localhost:5000`.
`127.0.0.1:5000`.

This version also expects to have SR-IOV enabled nics (SR-IOV Physical Function) on the current host, and will move
physical interfaces into the `KinD`'s cluster worker node(s) so that they can be used through multus and SR-IOV
This version requires SR-IOV enabled nics (SR-IOV Physical Function) on the current host, and will move
physical interfaces into the `K3d` cluster's agent node(s) (an agent node is a worker node in k3d terminology)
so that they can be used through multus and SR-IOV
components.

This providers also deploys [multus](https://github.com/k8snetworkplumbingwg/multus-cni)
This provider also deploys [multus](https://github.com/k8snetworkplumbingwg/multus-cni)
, [sriov-cni](https://github.com/k8snetworkplumbingwg/sriov-cni)
and [sriov-device-plugin](https://github.com/k8snetworkplumbingwg/sriov-network-device-plugin).

## Bringing the cluster up

```bash
export KUBEVIRT_PROVIDER=kind-1.23-sriov
export KUBEVIRT_NUM_NODES=3
export KUBEVIRT_PROVIDER=k3d-1.25-sriov
export KUBECONFIG=$(realpath _ci-configs/k3d-1.25-sriov/.kubeconfig)
make cluster-up

$ cluster-up/kubectl.sh get nodes
NAME STATUS ROLES AGE VERSION
sriov-control-plane Ready control-plane,master 20h v1.23.13
sriov-worker Ready worker 20h v1.23.13
sriov-worker2 Ready worker 20h v1.23.13

$ cluster-up/kubectl.sh get pods -n kube-system -l app=multus
NAME READY STATUS RESTARTS AGE
kube-multus-ds-amd64-d45n4 1/1 Running 0 20h
kube-multus-ds-amd64-g26xh 1/1 Running 0 20h
kube-multus-ds-amd64-mfh7c 1/1 Running 0 20h

$ cluster-up/kubectl.sh get pods -n sriov -l app=sriov-cni
```
```
$ kubectl get nodes
NAME STATUS ROLES AGE VERSION
k3d-sriov-server-0 Ready control-plane,master 67m v1.25.6+k3s1
k3d-sriov-agent-0 Ready worker 67m v1.25.6+k3s1
k3d-sriov-agent-1 Ready worker 67m v1.25.6+k3s1
$ kubectl get pods -n kube-system -l app=multus
NAME READY STATUS RESTARTS AGE
kube-multus-ds-z9hvs 1/1 Running 0 66m
kube-multus-ds-7shgv 1/1 Running 0 66m
kube-multus-ds-l49xj 1/1 Running 0 66m
$ kubectl get pods -n sriov -l app=sriov-cni
NAME READY STATUS RESTARTS AGE
kube-sriov-cni-ds-amd64-fv5cr 1/1 Running 0 20h
kube-sriov-cni-ds-amd64-q95q9 1/1 Running 0 20h
kube-sriov-cni-ds-amd64-4pndd 1/1 Running 0 66m
kube-sriov-cni-ds-amd64-68nhh 1/1 Running 0 65m
$ cluster-up/kubectl.sh get pods -n sriov -l app=sriovdp
$ kubectl get pods -n sriov -l app=sriovdp
NAME READY STATUS RESTARTS AGE
kube-sriov-device-plugin-amd64-h7h84 1/1 Running 0 20h
kube-sriov-device-plugin-amd64-xrr5z 1/1 Running 0 20h
kube-sriov-device-plugin-amd64-qk66v 1/1 Running 0 66m
kube-sriov-device-plugin-amd64-d5r5b 1/1 Running 0 65m
```
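
To double-check that the device plugin advertised VF resources on the agents, the node allocatable
resources can be inspected (a sketch; the exact resource name depends on the configured resource prefix):

```bash
kubectl get node k3d-sriov-agent-0 -o jsonpath='{.status.allocatable}' | tr ',' '\n' | grep -i sriov
```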

### Connecting to a node
```bash
export KUBEVIRT_PROVIDER=k3d-1.25-sriov
./cluster-up/ssh.sh <node_name> /bin/sh
```

## Bringing the cluster down

```bash
export KUBEVIRT_PROVIDER=kind-1.23-sriov
export KUBEVIRT_PROVIDER=k3d-1.25-sriov
make cluster-down
```

This destroys the whole cluster, and moves the SR-IOV nics to the root network namespace.
This destroys the whole cluster, and gracefully moves the SR-IOV nics to the root network namespace.

## Setting a custom kind version
Note: killing the containers / cluster without first gracefully moving the nics to the root ns
might result in unreachable nics for a few minutes.
`find /sys/class/net/*/device/sriov_numvfs` can be used to see when the nics are reachable again.
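For example, a small wait loop around that check (a sketch only):

```bash
# Poll until at least one NIC exposing sriov_numvfs is visible again in the root namespace
until find /sys/class/net/*/device/sriov_numvfs 2>/dev/null | grep -q .; do
    echo "waiting for the SR-IOV nics to reappear..."
    sleep 5
done
```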

In order to use a custom kind image / kind version, export `KIND_NODE_IMAGE`, `KIND_VERSION`, `KUBECTL_PATH` before
running cluster-up. For example in order to use kind 0.9.0 (which is based on k8s-1.19.1) use:
## Using podman
Podman v4 is required.

Run:
```bash
export KIND_NODE_IMAGE="kindest/node:v1.19.1@sha256:98cf5288864662e37115e362b23e4369c8c4a408f99cbc06e58ac30ddc721600"
export KIND_VERSION="0.9.0"
export KUBECTL_PATH="/usr/bin/kubectl"
systemctl enable --now podman.socket
ln -s /run/podman/podman.sock /var/run/docker.sock
```
The rest is as usual.
For more info see https://k3d.io/v5.4.1/usage/advanced/podman.
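
If creating the symlink is not desirable, the k3d podman guide also describes pointing the tooling at the
podman socket via environment variables (a sketch, assuming a rootful podman socket at the default path):

```bash
export DOCKER_HOST=unix:///run/podman/podman.sock
export DOCKER_SOCK=/run/podman/podman.sock
```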

This allows users to test or use custom images / different kind versions before making them official.
See https://github.com/kubernetes-sigs/kind/releases for details about node images according to the kind version.

## Running multi SR-IOV clusters locally

Kubevirtci SR-IOV provider supports running two clusters side by side with few known limitations.

General considerations:

- A SR-IOV PF must be available for each cluster. In order to achieve that, there are two options:

1. Assign just one PF for each worker node of each cluster by using `export PF_COUNT_PER_NODE=1` (this is the default
value).
2. Optional method: `export PF_BLACKLIST=<PF names>` the non used PFs, in order to prevent them from being allocated to
the current cluster. The user can list the PFs that should not be allocated to the current cluster, keeping in mind
that at least one (or 2 in case of migration), should not be listed, so they would be allocated for the current
cluster. Note: another reason to blacklist a PF, is in case its has a defect or should be kept for other operations (
for example sniffing).

- Clusters should be created one by another and not in parallel (to avoid races over SR-IOV PF's).
- The cluster names must be different. This can be achieved by setting `export CLUSTER_NAME=sriov2` on the 2nd cluster.
The default `CLUSTER_NAME` is `sriov`. The 2nd cluster registry would be exposed at `localhost:5001` automatically,
once the `CLUSTER_NAME`
is set to a non default value.
- Each cluster should be created on its own git clone folder, i.e:
`/root/project/kubevirtci1`
`/root/project/kubevirtci2`
In order to switch between them, change dir to that folder and set the env variables `KUBECONFIG`
and `KUBEVIRT_PROVIDER`.
- In case only one PF exists, for example if running on prow which will assign only one PF per job in its own DinD,
Kubevirtci is agnostic and nothing needs to be done, since all conditions above are met.
- Upper limit of the number of clusters that can be run on the same time equals number of PFs / number of PFs per
cluster, therefore, in case there is only one PF, only one cluster can be created. Locally the actual limit currently
supported is two clusters.
- In order to use `make cluster-down` please make sure the right `CLUSTER_NAME` is exported.
### Bumping calico
Fetch a new calico yaml and (see the sketch below):
1. Enable `allow_ip_forwarding` (See https://k3d.io/v5.0.1/usage/advanced/calico)
2. Prefix the images in the yaml with `quay.io/`
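
A possible way to do this (a sketch only; the calico version and the `docker.io/` image prefix are
assumptions, so verify them against the actual manifest):

```bash
# Download a calico manifest (the version is illustrative)
curl -Lo calico.yaml https://raw.githubusercontent.com/projectcalico/calico/v3.25.0/manifests/calico.yaml

# 1. In the calico-config ConfigMap, add the following inside the "calico" plugin
#    entry of cni_network_config (typically done by hand in an editor):
#      "container_settings": {
#        "allow_ip_forwarding": true
#      }

# 2. Pull the images from quay.io instead of docker.io
sed -i 's|image: docker.io/|image: quay.io/|g' calico.yaml
```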
20 changes: 9 additions & 11 deletions cluster-up/cluster/k3d-1.25-sriov/TROUBLESHOOTING.md
@@ -1,4 +1,4 @@
# How to troubleshoot a failing kind job
# How to troubleshoot a failing k3d job

If logging and output artifacts are not enough, there is a way to connect to a running CI pod and troubleshoot directly from there.

@@ -16,14 +16,14 @@ Just `go get` it by running `go get k8s.io/test-infra/prow/cmd/mkpj`
Then run the following command from a checkout of the [project-infra repo](https://github.com/kubevirt/project-infra):

```bash
mkpj --pull-number $KUBEVIRTPRNUMBER -job pull-kubevirt-e2e-kind-k8s-sriov-1.17.0 -job-config-path github/ci/prow/files/jobs/kubevirt/kubevirt-presubmits.yaml --config-path github/ci/prow/files/config.yaml > debugkind.yaml
mkpj --pull-number $KUBEVIRT_PR_NUMBER -job pull-kubevirt-e2e-k3d-1.25-sriov -job-config-path github/ci/prow/files/jobs/kubevirt/kubevirt-presubmits.yaml --config-path github/ci/prow/files/config.yaml > debugkind.yaml
```

You will end up having a ProwJob manifest in the `debugkind.yaml` file.

It's strongly recommended to rename the job by setting `metadata.name` to something more recognizable, as it will make the related pod easier to find and debug.
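
For example (a sketch; it assumes `yq` v4 is installed, and the new name is arbitrary):

```bash
yq -i '.metadata.name = "debug-k3d-sriov-<your-name>"' debugkind.yaml
```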

The $KUBEVIRTPRNUMBER can be an actual PR on the [kubevirt repo](https://github.com/kubevirt/kubevirt).
The `$KUBEVIRT_PR_NUMBER` can be an actual PR on the [kubevirt repo](https://github.com/kubevirt/kubevirt).

In case we just want to debug the cluster provided by the CI, it's recommended to override the entry point, either in the test PR we are instrumenting (a good sample can be found [here](https://github.com/kubevirt/kubevirt/pull/3022)), or by overriding the entry point directly in the prow job's manifest.

Expand All @@ -32,29 +32,27 @@ Remember that we want the cluster long living, so a long sleep must be provided
Make sure you switch to the `kubevirt-prow-jobs` project, and apply the manifest:

```bash
kubectl apply -f debugkind.yaml
kubectl apply -f debugkind.yaml
```

You will end up with a ProwJob object, and a pod with the same name you gave to the ProwJob.

Once the pod is up & running, connect to it via bash:

```bash
kubectl exec -it debugprowjobpod bash
kubectl exec -it debugprowjobpod bash
```

### Logistics

Once you are in the pod, you'll be able to troubleshoot what's happening in the environment CI is running its tests.

Run the follow to bring up a [kind](https://github.com/kubernetes-sigs/kind) cluster with a single node setup and the SR-IOV operator already setup to go (if it wasn't already done by the job itself).
Run the following to bring up a [k3d](https://github.com/k3d-io/k3d) cluster with SR-IOV installed.

```bash
KUBEVIRT_PROVIDER=kind-k8s-sriov-1.17.0 make cluster-up
KUBEVIRT_PROVIDER=k3d-1.25-sriov make cluster-up
```

The kubeconfig file will be available under `/root/.kube/kind-config-sriov`.

Use `k3d kubeconfig print sriov` to extract the kubeconfig file.
The `kubectl` binary is already on board and in `$PATH`.

The container acting as node is the one named `sriov-control-plane`. You can even see what's in there by running `docker exec -it sriov-control-plane bash`.
See `README.md` for more info.
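
For example (a sketch; `sriov` is the provider's default cluster name), the printed kubeconfig can be
written to a file and used directly:

```bash
k3d kubeconfig print sriov > /tmp/k3d-sriov.kubeconfig
export KUBECONFIG=/tmp/k3d-sriov.kubeconfig
kubectl get nodes
```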
3 changes: 1 addition & 2 deletions cluster-up/cluster/k3d-1.25-sriov/config_sriov_cluster.sh
@@ -4,8 +4,7 @@

set -xe

PF_COUNT_PER_NODE=${PF_COUNT_PER_NODE:-1}
[ $PF_COUNT_PER_NODE -le 0 ] && echo "FATAL: PF_COUNT_PER_NODE must be a positive integer" >&2 && exit 1
PF_COUNT_PER_NODE=1

SCRIPT_PATH=$(dirname "$(realpath "$0")")

77 changes: 26 additions & 51 deletions cluster-up/cluster/k3d-1.25-sriov/provider.sh
@@ -2,68 +2,43 @@

set -e

DEFAULT_CLUSTER_NAME="sriov"
DEFAULT_HOST_PORT=5000
ALTERNATE_HOST_PORT=5001
export CLUSTER_NAME=${CLUSTER_NAME:-$DEFAULT_CLUSTER_NAME}
export CLUSTER_NAME="sriov"
export HOST_PORT=5000

if [ $CLUSTER_NAME == $DEFAULT_CLUSTER_NAME ]; then
export HOST_PORT=$DEFAULT_HOST_PORT
else
export HOST_PORT=$ALTERNATE_HOST_PORT
fi
DEPLOY_SRIOV=${DEPLOY_SRIOV:-true}

function set_kind_params() {
export KIND_VERSION="${KIND_VERSION:-0.17.0}"
export KIND_NODE_IMAGE="${KIND_NODE_IMAGE:-quay.io/kubevirtci/kindest-node:v1.23.13@sha256:ef453bb7c79f0e3caba88d2067d4196f427794086a7d0df8df4f019d5e336b61}"
export KUBECTL_PATH="${KUBECTL_PATH:-/bin/kubectl}"
function print_available_nics() {
echo 'STEP: Available NICs'
# print hardware info for easier debugging based on logs
${CRI_BIN} run --rm --cap-add=SYS_RAWIO quay.io/phoracek/lspci@sha256:0f3cacf7098202ef284308c64e3fc0ba441871a846022bb87d65ff130c79adb1 sh -c "lspci | egrep -i 'network|ethernet'"
echo
}

function print_sriov_data() {
nodes=$(_kubectl get nodes -o=custom-columns=:.metadata.name | awk NF)
function print_agents_sriov_status() {
nodes=$(_get_agent_nodes)
echo "STEP: Print agents SR-IOV status"
for node in $nodes; do
if [[ ! "$node" =~ .*"control-plane".* ]]; then
echo "Node: $node"
echo "VFs:"
${CRI_BIN} exec $node bash -c "ls -l /sys/class/net/*/device/virtfn*"
echo "PFs PCI Addresses:"
${CRI_BIN} exec $node bash -c "grep PCI_SLOT_NAME /sys/class/net/*/device/uevent"
fi
echo "Node: $node"
echo "VFs:"
${CRI_BIN} exec $node /bin/sh -c "ls -l /sys/class/net/*/device/virtfn*"
echo "PFs PCI Addresses:"
${CRI_BIN} exec $node /bin/sh -c "grep PCI_SLOT_NAME /sys/class/net/*/device/uevent"
done
echo
}

function configure_registry_proxy() {
[ "$CI" != "true" ] && return

echo "Configuring cluster nodes to work with CI mirror-proxy..."

local -r ci_proxy_hostname="docker-mirror-proxy.kubevirt-prow.svc"
local -r kind_binary_path="${KUBEVIRTCI_CONFIG_PATH}/$KUBEVIRT_PROVIDER/.kind"
local -r configure_registry_proxy_script="${KUBEVIRTCI_PATH}/cluster/kind/configure-registry-proxy.sh"

KIND_BIN="$kind_binary_path" PROXY_HOSTNAME="$ci_proxy_hostname" $configure_registry_proxy_script
function deploy_sriov() {
print_available_nics
${KUBEVIRTCI_PATH}/cluster/$KUBEVIRT_PROVIDER/config_sriov_cluster.sh
print_agents_sriov_status
}

function up() {
# print hardware info for easier debugging based on logs
echo 'Available NICs'
${CRI_BIN} run --rm --cap-add=SYS_RAWIO quay.io/phoracek/lspci@sha256:0f3cacf7098202ef284308c64e3fc0ba441871a846022bb87d65ff130c79adb1 sh -c "lspci | egrep -i 'network|ethernet'"
echo ""

cp $KIND_MANIFESTS_DIR/kind.yaml ${KUBEVIRTCI_CONFIG_PATH}/$KUBEVIRT_PROVIDER/kind.yaml
kind_up

configure_registry_proxy

# remove the rancher.io kind default storageClass
_kubectl delete sc standard
k3d_up
[ $DEPLOY_SRIOV == true ] && deploy_sriov

${KUBEVIRTCI_PATH}/cluster/$KUBEVIRT_PROVIDER/config_sriov_cluster.sh

print_sriov_data
echo "$KUBEVIRT_PROVIDER cluster '$CLUSTER_NAME' is ready"
version=$(_kubectl get node k3d-$CLUSTER_NAME-server-0 -o=custom-columns=VERSION:.status.nodeInfo.kubeletVersion --no-headers)
echo "$KUBEVIRT_PROVIDER cluster '$CLUSTER_NAME' is ready ($version)"
}

set_kind_params

source ${KUBEVIRTCI_PATH}/cluster/kind/common.sh
source ${KUBEVIRTCI_PATH}/cluster/k3d/common.sh
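
Judging by the `DEPLOY_SRIOV` flag introduced above, deploying the SR-IOV components can be skipped when
only the bare k3d cluster is needed (a usage sketch, not an officially documented flow):

```bash
export KUBEVIRT_PROVIDER=k3d-1.25-sriov
export DEPLOY_SRIOV=false
make cluster-up
```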
Original file line number Diff line number Diff line change
@@ -18,10 +18,10 @@ PATCH_NODE_SELECTOR_TEMPLATE="${MANIFESTS_DIR}/patch-node-selector.yaml.in"
PATCH_NODE_SELECTOR="${CUSTOM_MANIFESTS}/patch-node-selector.yaml"

KUBECONFIG="${KUBEVIRTCI_CONFIG_PATH}/$KUBEVIRT_PROVIDER/.kubeconfig"
KUBECTL="${KUBEVIRTCI_CONFIG_PATH}/$KUBEVIRT_PROVIDER/.kubectl --kubeconfig=${KUBECONFIG}"

function _kubectl() {
${KUBECTL} "$@"
export KUBECONFIG=${KUBEVIRTCI_CONFIG_PATH}/$KUBEVIRT_PROVIDER/.kubeconfig
${KUBEVIRTCI_CONFIG_PATH}/$KUBEVIRT_PROVIDER/.kubectl --kubeconfig=$KUBECONFIG "$@"
}

function _retry() {
@@ -63,7 +63,7 @@ function _check_all_pods_ready() {
# not using kubectl wait since with the sriov operator the pods get restarted a couple of times and this is
# more reliable
function sriov_components::wait_pods_ready() {
local -r tries=30
local -r tries=60
local -r wait_time=10

local -r wait_message="Waiting for all pods to become ready.."
Expand Down Expand Up @@ -126,7 +126,7 @@ function sriov_components::deploy() {
local -r label_key=$5
local -r label_value=$6

_create_custom_manifests_dir
_create_custom_manifests_dir
_prepare_node_selector_patch "$label_key" "$label_value"
_prepare_sriovdp_resource_prefix_patch "$resource_prefix"
_prepare_device_plugin_config \