From 4b94a1f247074cfce380fbb053aae56af6084528 Mon Sep 17 00:00:00 2001
From: Andrea Panattoni
Date: Tue, 9 Apr 2024 10:17:51 +0200
Subject: [PATCH 1/6] e2e: fix `Debug logging should be visible in multus pod`
 flake

Use the `Pod.Status.StartTime` field to infer the timestamp lower bound
of the log lines of interest. This field is populated by the kubelet, so
it uses the same clock as the multus pod logs.

Prior to this commit, the `Meta.CreationTimestamp` field was used. That
field is populated by the API server, which can have a different clock
than the kubelet, as the two are likely to run on different nodes. The
following example comes from a CI run:

```
{
  "metadata": {
    "name": "testpod-68rc4",
    "generateName": "testpod-",
    "namespace": "sriov-conformance-testing",
    "creationTimestamp": "2024-04-08T10:44:45Z",
    ...
  "status": {
    ...
    "startTime": "2024-04-08T10:44:44Z",
}
```
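To make the failure mode concrete, here is a minimal, self-contained
sketch of the kind of filtering the helper performs (`filterLogsSince`
and the RFC3339 line prefix are illustrative assumptions, not the
verbatim `getMultusPodLogs` code):

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

// filterLogsSince keeps only the lines stamped at or after `since`.
// It assumes each line starts with an RFC3339 timestamp written by the
// node-local clock (the same clock the kubelet uses for pod startTime).
func filterLogsSince(lines []string, since time.Time) []string {
	var recent []string
	for _, line := range lines {
		fields := strings.Fields(line)
		if len(fields) == 0 {
			continue
		}
		ts, err := time.Parse(time.RFC3339, fields[0])
		if err != nil || ts.Before(since) {
			continue // drop unparsable or too-old lines
		}
		recent = append(recent, line)
	}
	return recent
}

func main() {
	since := time.Date(2024, 4, 8, 10, 44, 44, 0, time.UTC) // pod startTime
	lines := []string{
		"2024-04-08T10:44:44Z multus: ADD starting",
		"2024-04-08T10:43:00Z multus: older entry",
	}
	fmt.Println(filterLogsSince(lines, since)) // keeps only the first line
}
```

With `since` taken from `creationTimestamp` (10:44:45Z) rather than
`startTime` (10:44:44Z), a line stamped 10:44:44Z by the node clock is
wrongly discarded, which is exactly the observed flake.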
Signed-off-by: Andrea Panattoni
---
 test/conformance/tests/test_sriov_operator.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go
index 9ad400cbc..55711b78f 100644
--- a/test/conformance/tests/test_sriov_operator.go
+++ b/test/conformance/tests/test_sriov_operator.go
@@ -1015,7 +1015,7 @@ var _ = Describe("[sriov] operator", func() {

 			testPod := createTestPod(node, []string{sriovNetworkName})

-			recentMultusLogs := getMultusPodLogs(testPod.Spec.NodeName, testPod.ObjectMeta.CreationTimestamp.Time)
+			recentMultusLogs := getMultusPodLogs(testPod.Spec.NodeName, testPod.Status.StartTime.Time)

 			Expect(recentMultusLogs).To(
 				ContainElement(

From 04dbdf58e984275a4bc498b3420478b313cdeb29 Mon Sep 17 00:00:00 2001
From: Sebastian Sch
Date: Thu, 11 Apr 2024 15:21:47 +0300
Subject: [PATCH 2/6] Block overlapping policies that use the same rootDevice

Reject a policy whose rootDevices selector reuses a root device that an
existing policy already claims, mirroring the overlap check that is
already done for pfNames. To share the parsing logic, `ParsePFName` is
renamed to `ParseVfRange` and the "#" split is extracted into a new
`SplitDeviceFromRange` helper.
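For illustration, a minimal sketch of how the renamed helpers behave for
both selector styles (assuming the module's `api/v1` import path; the
printed values reflect the intended behavior described above):

```go
package main

import (
	"fmt"

	sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
)

func main() {
	// A PF name with an explicit VF range: "#" separates device from range.
	name, start, end, err := sriovnetworkv1.ParseVfRange("ens1f0#0-3")
	fmt.Println(name, start, end, err) // ens1f0 0 3 <nil>

	// A root device (PCI address) without a range: only the device name is
	// returned and the range keeps the invalidVfIndex sentinel.
	dev, rngSt, rngEnd, err := sriovnetworkv1.ParseVfRange("0000:86:00.1")
	fmt.Println(dev, rngSt, rngEnd, err)

	// SplitDeviceFromRange only performs the "#" split.
	device, vfRange := sriovnetworkv1.SplitDeviceFromRange("ens1f0#0-3")
	fmt.Println(device, vfRange) // ens1f0 0-3
}
```

Root devices are compared for strict equality until ranges are
implemented for them as well (see the TODO in `validateRootDevices`).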
Signed-off-by: Sebastian Sch
---
 api/v1/helper.go                         | 32 +++++++++----
 pkg/webhook/validate.go                  | 58 +++++++++++++++++-------
 pkg/webhook/validate_test.go             | 25 ++++++++++
 test/e2e/sriovoperatornodepolicy_test.go |  2 +-
 4 files changed, 90 insertions(+), 27 deletions(-)

diff --git a/api/v1/helper.go b/api/v1/helper.go
index 462afa356..8b1cf9c96 100644
--- a/api/v1/helper.go
+++ b/api/v1/helper.go
@@ -406,7 +406,7 @@ func (p *SriovNetworkNodePolicy) Apply(state *SriovNetworkNodeState, equalPriori
 		ExternallyManaged: p.Spec.ExternallyManaged,
 	}
 	if p.Spec.NumVfs > 0 {
-		group, err := p.generateVfGroup(&iface)
+		group, err := p.generatePfNameVfGroup(&iface)
 		if err != nil {
 			return err
 		}
@@ -475,13 +475,13 @@ func (gr VfGroup) isVFRangeOverlapping(group VfGroup) bool {
 	return IndexInRange(rngSt, group.VfRange) || IndexInRange(rngEnd, group.VfRange)
 }

-func (p *SriovNetworkNodePolicy) generateVfGroup(iface *InterfaceExt) (*VfGroup, error) {
+func (p *SriovNetworkNodePolicy) generatePfNameVfGroup(iface *InterfaceExt) (*VfGroup, error) {
 	var err error
 	pfName := ""
 	var rngStart, rngEnd int
 	found := false
 	for _, selector := range p.Spec.NicSelector.PfNames {
-		pfName, rngStart, rngEnd, err = ParsePFName(selector)
+		pfName, rngStart, rngEnd, err = ParseVfRange(selector)
 		if err != nil {
 			log.Error(err, "Unable to parse PF Name.")
 			return nil, err
@@ -534,15 +534,27 @@ func parseRange(r string) (rngSt, rngEnd int, err error) {
 	return
 }

-// Parse PF name with VF range
-func ParsePFName(name string) (ifName string, rngSt, rngEnd int, err error) {
+// SplitDeviceFromRange returns the device name and the range.
+// The split is based on the "#" character.
+func SplitDeviceFromRange(device string) (string, string) {
+	if strings.Contains(device, "#") {
+		fields := strings.Split(device, "#")
+		return fields[0], fields[1]
+	}
+
+	return device, ""
+}
+
+// ParseVfRange parses a device name with an optional VF range.
+// The device can be a rootDevice or a PF name.
+// If no range is detected, only the device name is returned.
+func ParseVfRange(device string) (rootDeviceName string, rngSt, rngEnd int, err error) {
 	rngSt, rngEnd = invalidVfIndex, invalidVfIndex
-	if strings.Contains(name, "#") {
-		fields := strings.Split(name, "#")
-		ifName = fields[0]
-		rngSt, rngEnd, err = parseRange(fields[1])
+	rootDeviceName, splitRange := SplitDeviceFromRange(device)
+	if splitRange != "" {
+		rngSt, rngEnd, err = parseRange(splitRange)
 	} else {
-		ifName = name
+		rootDeviceName = device
 	}
 	return
 }

diff --git a/pkg/webhook/validate.go b/pkg/webhook/validate.go
index 07ea8ea82..a4cca97c0 100644
--- a/pkg/webhook/validate.go
+++ b/pkg/webhook/validate.go
@@ -367,6 +367,11 @@ func validatePolicyForNodePolicy(current *sriovnetworkv1.SriovNetworkNodePolicy,
 		return err
 	}

+	err = validateRootDevices(current, previous)
+	if err != nil {
+		return err
+	}
+
 	err = validateExludeTopologyField(current, previous)
 	if err != nil {
 		return err
@@ -377,28 +382,18 @@ func validatePolicyForNodePolicy(current *sriovnetworkv1.SriovNetworkNodePolicy,

 func validatePfNames(current *sriovnetworkv1.SriovNetworkNodePolicy, previous *sriovnetworkv1.SriovNetworkNodePolicy) error {
 	for _, curPf := range current.Spec.NicSelector.PfNames {
-		curName, curRngSt, curRngEnd, err := sriovnetworkv1.ParsePFName(curPf)
+		curName, curRngSt, curRngEnd, err := sriovnetworkv1.ParseVfRange(curPf)
 		if err != nil {
 			return fmt.Errorf("invalid PF name: %s", curPf)
 		}
 		for _, prePf := range previous.Spec.NicSelector.PfNames {
-			// Not validate return err of ParsePFName for previous PF
+			// Do not validate the returned err for the previous PF
 			// since it should already be evaluated in previous run.
-			preName, preRngSt, preRngEnd, _ := sriovnetworkv1.ParsePFName(prePf)
+			preName, preRngSt, preRngEnd, _ := sriovnetworkv1.ParseVfRange(prePf)
 			if curName == preName {
-				// reject policy with externallyManage if there is a policy on the same PF without it
-				if current.Spec.ExternallyManaged != previous.Spec.ExternallyManaged {
-					return fmt.Errorf("externallyManage is inconsistent with existing policy %s", previous.GetName())
-				}
-
-				// reject policy with externallyManage if there is a policy on the same PF with switch dev
-				if current.Spec.ExternallyManaged && previous.Spec.EswitchMode == sriovnetworkv1.ESwithModeSwitchDev {
-					return fmt.Errorf("externallyManage overlap with switchdev mode in existing policy %s", previous.GetName())
-				}
-
-				// reject policy with externallyManage if there is a policy on the same PF with switch dev
-				if previous.Spec.ExternallyManaged && current.Spec.EswitchMode == sriovnetworkv1.ESwithModeSwitchDev {
-					return fmt.Errorf("switchdev overlap with externallyManage mode in existing policy %s", previous.GetName())
+				err = validateExternallyManage(current, previous)
+				if err != nil {
+					return err
 				}

 				// Check for overlapping ranges
@@ -413,6 +408,37 @@ func validatePfNames(current *sriovnetworkv1.SriovNetworkNodePolicy, previous *s
 	return nil
 }

+func validateRootDevices(current *sriovnetworkv1.SriovNetworkNodePolicy, previous *sriovnetworkv1.SriovNetworkNodePolicy) error {
+	for _, curRootDevice := range current.Spec.NicSelector.RootDevices {
+		for _, preRootDevice := range previous.Spec.NicSelector.RootDevices {
+			// TODO: (SchSeba) implement range for root devices
+			if curRootDevice == preRootDevice {
+				return fmt.Errorf("root device %s is overlapped with existing policy %s", curRootDevice, previous.GetName())
+			}
+		}
+	}
+	return nil
+}
+
+func validateExternallyManage(current, previous *sriovnetworkv1.SriovNetworkNodePolicy) error {
+	// reject a policy that sets externallyManage differently from an existing policy on the same PF
+	if current.Spec.ExternallyManaged != previous.Spec.ExternallyManaged {
+		return fmt.Errorf("externallyManage is inconsistent with existing policy %s", previous.GetName())
+	}
+
+	// reject an externallyManage policy if there is a switchdev policy on the same PF
+	if current.Spec.ExternallyManaged && previous.Spec.EswitchMode == sriovnetworkv1.ESwithModeSwitchDev {
+		return fmt.Errorf("externallyManage overlap with switchdev mode in existing policy %s", previous.GetName())
+	}
+
+	// reject a switchdev policy if there is an externallyManage policy on the same PF
+	if previous.Spec.ExternallyManaged && current.Spec.EswitchMode == sriovnetworkv1.ESwithModeSwitchDev {
+		return fmt.Errorf("switchdev overlap with externallyManage mode in existing policy %s", previous.GetName())
+	}
+
+	return nil
+}
+
 func validateExludeTopologyField(current *sriovnetworkv1.SriovNetworkNodePolicy, previous *sriovnetworkv1.SriovNetworkNodePolicy) error {
 	if current.Spec.ResourceName != previous.Spec.ResourceName {
 		return nil

diff --git a/pkg/webhook/validate_test.go b/pkg/webhook/validate_test.go
index 2f1f2ffab..ccb69a5b7 100644
--- a/pkg/webhook/validate_test.go
+++ b/pkg/webhook/validate_test.go
@@ -774,6 +774,31 @@ func TestValidatePoliciesWithSameExcludeTopologyForTheSameResource(t *testing.T)
 	g.Expect(err).NotTo(HaveOccurred())
 }

+func TestValidatePoliciesWithDifferentNumVfForTheSameResourceAndTheSameRootDevice(t *testing.T) {
+	current := &SriovNetworkNodePolicy{
+		ObjectMeta: metav1.ObjectMeta{Name: "currentPolicy"},
+		Spec: SriovNetworkNodePolicySpec{
+			ResourceName: "resourceX",
+			NumVfs:       10,
+			NicSelector:  SriovNetworkNicSelector{RootDevices: []string{"0000:86:00.1"}},
+		},
+	}
+
+	previous := &SriovNetworkNodePolicy{
+		ObjectMeta: metav1.ObjectMeta{Name: "previousPolicy"},
+		Spec: SriovNetworkNodePolicySpec{
+			ResourceName: "resourceX",
+			NumVfs:       5,
+			NicSelector:  SriovNetworkNicSelector{RootDevices: []string{"0000:86:00.1"}},
+		},
+	}
+
+	err := validatePolicyForNodePolicy(current, previous)
+
+	g := NewGomegaWithT(t)
+	g.Expect(err).To(MatchError("root device 0000:86:00.1 is overlapped with existing policy previousPolicy"))
+}
+
 func TestStaticValidateSriovNetworkNodePolicyWithValidVendorDevice(t *testing.T) {
 	policy := &SriovNetworkNodePolicy{
 		Spec: SriovNetworkNodePolicySpec{

diff --git a/test/e2e/sriovoperatornodepolicy_test.go b/test/e2e/sriovoperatornodepolicy_test.go
index ba2ec5e64..10eb86589 100644
--- a/test/e2e/sriovoperatornodepolicy_test.go
+++ b/test/e2e/sriovoperatornodepolicy_test.go
@@ -268,7 +268,7 @@ var _ = Describe("Operator", func() {
 				Expect(iface.VfGroups[0].DeviceType).To(Equal(policy.Spec.DeviceType))
 				Expect(iface.VfGroups[0].ResourceName).To(Equal(policy.Spec.ResourceName))

-				pfName, rngStart, rngEnd, err := sriovnetworkv1.ParsePFName(policy.Spec.NicSelector.PfNames[0])
+				pfName, rngStart, rngEnd, err := sriovnetworkv1.ParseVfRange(policy.Spec.NicSelector.PfNames[0])
 				Expect(err).NotTo(HaveOccurred())
 				rng := strconv.Itoa(rngStart) + "-" + strconv.Itoa(rngEnd)
 				Expect(iface.Name).To(Equal(pfName))

From 2e4e3ab1d2845220307959f5603a9db65852024c Mon Sep 17 00:00:00 2001
From: Yury Kulazhenkov
Date: Tue, 9 Apr 2024 14:44:30 +0300
Subject: [PATCH 3/6] Fix service CMD to return an error if device discovery
 failed

Previously, `callPlugin` returned nil when reading the network node
state failed, so the systemd phase reported success even though device
discovery had failed.
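A standalone stand-in for the failure mode (hypothetical names, not the
daemon code): if the phase swallows the discovery error, the process
exits 0 and systemd records success; propagating the error lets the
unit fail visibly.

```go
package main

import (
	"errors"
	"fmt"
	"os"
)

func discoverDevices() error { return errors.New("device discovery failed") }

// runPhase mimics the fixed control flow: the error from discovery is
// propagated to the caller instead of being replaced by nil.
func runPhase() error {
	if err := discoverDevices(); err != nil {
		return err // previously `return nil`, which hid the failure
	}
	return nil
}

func main() {
	if err := runPhase(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1) // non-zero exit lets systemd mark the service as failed
	}
}
```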
Signed-off-by: Yury Kulazhenkov
---
 cmd/sriov-network-config-daemon/service.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmd/sriov-network-config-daemon/service.go b/cmd/sriov-network-config-daemon/service.go
index 1917ed63c..88b60036e 100644
--- a/cmd/sriov-network-config-daemon/service.go
+++ b/cmd/sriov-network-config-daemon/service.go
@@ -182,7 +182,7 @@ func callPlugin(setupLog logr.Logger, phase string, conf *systemd.SriovConfig, h

 	nodeState, err := getNetworkNodeState(setupLog, conf, hostHelpers)
 	if err != nil {
-		return nil
+		return err
 	}
 	_, _, err = configPlugin.OnNodeStateChange(nodeState)
 	if err != nil {

From 07254b5d6032f2650affb88d5c4f2a0e443a427e Mon Sep 17 00:00:00 2001
From: Sebastian Sch
Date: Tue, 16 Apr 2024 21:41:03 +0300
Subject: [PATCH 4/6] Fix config-daemon not waiting for drain to complete

`handleDrain` now reports whether a drain is still in progress so that
`nodeStateSyncHandler` can stop the current reconcile and wait for the
drain controller, instead of continuing to configure a node that has
not finished draining. The config-daemon tests now start their
nodeState objects from the `DrainIdle` annotation.
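A simplified stand-in for the new contract (hypothetical names and
annotation values, not the daemon's actual types):

```go
package main

import "fmt"

// handleDrain is a hypothetical stand-in: it returns true when the caller
// must stop the current reconcile and wait for the drain controller to
// update the nodeState annotation.
func handleDrain(annotation string) (wait bool, err error) {
	switch annotation {
	case "DrainComplete", "Idle":
		return false, nil // safe to continue with configuration
	default: // e.g. "Draining": the operator is still working
		return true, nil
	}
}

func syncHandler(annotation string) error {
	wait, err := handleDrain(annotation)
	if err != nil {
		return err
	}
	if wait {
		fmt.Println("waiting for drain to complete, requeueing")
		return nil // finish this reconcile; a new event will retry
	}
	fmt.Println("drain settled, applying configuration")
	return nil
}

func main() {
	_ = syncHandler("Draining")      // waits
	_ = syncHandler("DrainComplete") // configures
}
```

The key design point is that returning `(true, nil)` is not an error:
the reconcile simply ends, and the next nodeState event re-enters the
handler once the drain controller has made progress.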
Signed-off-by: Sebastian Sch
---
 pkg/daemon/daemon.go      | 33 ++++++++++++++++++++++-----------
 pkg/daemon/daemon_test.go | 15 +++++++++------
 2 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index a389c1758..07eefb4f8 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -484,10 +484,14 @@ func (dn *Daemon) nodeStateSyncHandler() error {
 	}

 	if reqDrain || !utils.ObjectHasAnnotation(dn.desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainIdle) {
-		if err := dn.handleDrain(reqReboot); err != nil {
+		drainInProcess, err := dn.handleDrain(reqReboot)
+		if err != nil {
 			log.Log.Error(err, "failed to handle drain")
 			return err
 		}
+		if drainInProcess {
+			return nil
+		}
 	}

 	if !reqReboot && !vars.UsingSystemdMode {
@@ -560,20 +564,25 @@ func (dn *Daemon) nodeStateSyncHandler() error {
 	return nil
 }

-func (dn *Daemon) handleDrain(reqReboot bool) error {
+// handleDrain adds the right annotation to the node and nodeState objects
+// and returns true if the reconcile loop needs to stop and wait for a new object change
+func (dn *Daemon) handleDrain(reqReboot bool) (bool, error) {
+	// done with the drain, we can continue with the configuration
 	if utils.ObjectHasAnnotation(dn.desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete) {
 		log.Log.Info("handleDrain(): the node complete the draining")
-		return nil
+		return false, nil
 	}

+	// the operator is still draining the node, so reconcile again later
 	if utils.ObjectHasAnnotation(dn.desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.Draining) {
 		log.Log.Info("handleDrain(): the node is still draining")
-		return nil
+		return true, nil
 	}

+	// drain is disabled, we can continue with the configuration
 	if dn.disableDrain {
 		log.Log.Info("handleDrain(): drain is disabled in sriovOperatorConfig")
-		return nil
+		return false, nil
 	}

 	if reqReboot {
@@ -582,32 +591,34 @@ func (dn *Daemon) handleDrain(reqReboot bool) error {
 		err := utils.AnnotateNode(context.Background(), vars.NodeName, consts.NodeDrainAnnotation, consts.RebootRequired, dn.client)
 		if err != nil {
 			log.Log.Error(err, "applyDrainRequired(): Failed to annotate node")
-			return err
+			return false, err
 		}

 		log.Log.Info("handleDrain(): apply 'Reboot_Required' annotation for nodeState")
 		if err := utils.AnnotateObject(context.Background(), dn.desiredNodeState,
 			consts.NodeStateDrainAnnotation, consts.RebootRequired, dn.client); err != nil {
-			return err
+			return false, err
 		}

-		return nil
+		// the node was annotated, we need to wait for the operator to finish the drain
+		return true, nil
 	}

 	log.Log.Info("handleDrain(): apply 'Drain_Required' annotation for node")
 	err := utils.AnnotateNode(context.Background(), vars.NodeName, consts.NodeDrainAnnotation, consts.DrainRequired, dn.client)
 	if err != nil {
 		log.Log.Error(err, "handleDrain(): Failed to annotate node")
-		return err
+		return false, err
 	}

 	log.Log.Info("handleDrain(): apply 'Drain_Required' annotation for nodeState")
 	if err := utils.AnnotateObject(context.Background(), dn.desiredNodeState,
 		consts.NodeStateDrainAnnotation, consts.DrainRequired, dn.client); err != nil {
-		return err
+		return false, err
 	}

-	return nil
+	// the node was annotated, we need to wait for the operator to finish the drain
+	return true, nil
 }

 func (dn *Daemon) restartDevicePluginPod() error {

diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go
index 54b75e3e8..a1e29dcb2 100644
--- a/pkg/daemon/daemon_test.go
+++ b/pkg/daemon/daemon_test.go
@@ -194,8 +194,9 @@ var _ = Describe("Config Daemon", func() {

 			nodeState := &sriovnetworkv1.SriovNetworkNodeState{
 				ObjectMeta: metav1.ObjectMeta{
-					Name:       "test-node",
-					Generation: 123,
+					Name:        "test-node",
+					Generation:  123,
+					Annotations: map[string]string{consts.NodeStateDrainAnnotationCurrent: consts.DrainIdle},
 				},
 				Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{},
 				Status: sriovnetworkv1.SriovNetworkNodeStateStatus{
@@ -253,8 +254,9 @@ var _ = Describe("Config Daemon", func() {

 				nodeState1 := &sriovnetworkv1.SriovNetworkNodeState{
 					ObjectMeta: metav1.ObjectMeta{
-						Name:       "test-node",
-						Generation: 123,
+						Name:        "test-node",
+						Generation:  123,
+						Annotations: map[string]string{consts.NodeStateDrainAnnotationCurrent: consts.DrainIdle},
 					},
 				}
 				Expect(
@@ -263,8 +265,9 @@ var _ = Describe("Config Daemon", func() {

 				nodeState2 := &sriovnetworkv1.SriovNetworkNodeState{
 					ObjectMeta: metav1.ObjectMeta{
-						Name:       "test-node",
-						Generation: 777,
+						Name:        "test-node",
+						Generation:  777,
+						Annotations: map[string]string{consts.NodeStateDrainAnnotationCurrent: consts.DrainIdle},
 					},
 				}
 				Expect(
From aa1ba6530691a0af501091708bf6de3ab05f070c Mon Sep 17 00:00:00 2001
From: Martin André
Date: Wed, 17 Apr 2024 15:13:59 +0200
Subject: [PATCH 5/6] Make OVS_CNI_IMAGE truly optional

The `hack/env.sh` script expected `OVS_CNI_IMAGE` to be set, and exited
with an error when it was not. There is no good reason to error out on
a missing variable for an optional image. Instead, set the variable to
the empty string when it is unset. When `SKIP_VAR_SET` is unset, the
default for `OVS_CNI_IMAGE` remains a valid image.
---
 hack/env.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hack/env.sh b/hack/env.sh
index eede3a1e5..26d9c767e 100755
--- a/hack/env.sh
+++ b/hack/env.sh
@@ -9,10 +9,10 @@ if [ -z $SKIP_VAR_SET ]; then
   export SRIOV_NETWORK_WEBHOOK_IMAGE=${SRIOV_NETWORK_WEBHOOK_IMAGE:-ghcr.io/k8snetworkplumbingwg/sriov-network-operator-webhook}
   export SRIOV_NETWORK_OPERATOR_IMAGE=${SRIOV_NETWORK_OPERATOR_IMAGE:-ghcr.io/k8snetworkplumbingwg/sriov-network-operator}
 else
+  # ensure that OVS_CNI_IMAGE is set; the empty string is a valid value
+  OVS_CNI_IMAGE=${OVS_CNI_IMAGE:-}
   [ -z $SRIOV_CNI_IMAGE ] && echo "SRIOV_CNI_IMAGE is empty but SKIP_VAR_SET is set" && exit 1
   [ -z $SRIOV_INFINIBAND_CNI_IMAGE ] && echo "SRIOV_INFINIBAND_CNI_IMAGE is empty but SKIP_VAR_SET is set" && exit 1
-  # check that OVS_CNI_IMAGE is set to any value, empty string is a valid value
-  [ -z ${OVS_CNI_IMAGE+set} ] && echo "OVS_CNI_IMAGE is empty but SKIP_VAR_SET is set" && exit 1
   [ -z $SRIOV_DEVICE_PLUGIN_IMAGE ] && echo "SRIOV_DEVICE_PLUGIN_IMAGE is empty but SKIP_VAR_SET is set" && exit 1
   [ -z $NETWORK_RESOURCES_INJECTOR_IMAGE ] && echo "NETWORK_RESOURCES_INJECTOR_IMAGE is empty but SKIP_VAR_SET is set" && exit 1
   [ -z $SRIOV_NETWORK_CONFIG_DAEMON_IMAGE ] && echo "SRIOV_NETWORK_CONFIG_DAEMON_IMAGE is empty but SKIP_VAR_SET is set" && exit 1

From 4afa11366c1437e37b20f31ca85a578f6d1f61a5 Mon Sep 17 00:00:00 2001
From: Sebastian Scheinkman
Date: Thu, 18 Apr 2024 13:41:29 +0000
Subject: [PATCH 6/6] d/s: run make bundle

---
 .../manifests/sriov-network-operator.clusterserviceversion.yaml | 2 +-
 .../stable/sriov-network-operator.clusterserviceversion.yaml    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bundle/manifests/sriov-network-operator.clusterserviceversion.yaml b/bundle/manifests/sriov-network-operator.clusterserviceversion.yaml
index 3c90c71ea..bbdff4d5b 100644
--- a/bundle/manifests/sriov-network-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/sriov-network-operator.clusterserviceversion.yaml
@@ -100,7 +100,7 @@ metadata:
     categories: Networking
     certified: "false"
     containerImage: quay.io/openshift/origin-sriov-network-operator:4.16
-    createdAt: "2024-04-18T09:27:02Z"
+    createdAt: "2024-04-18T13:41:29Z"
     description: An operator for configuring SR-IOV components and initializing SRIOV
       network devices in Openshift cluster.
     features.operators.openshift.io/cnf: "false"
diff --git a/manifests/stable/sriov-network-operator.clusterserviceversion.yaml b/manifests/stable/sriov-network-operator.clusterserviceversion.yaml
index 3c90c71ea..bbdff4d5b 100644
--- a/manifests/stable/sriov-network-operator.clusterserviceversion.yaml
+++ b/manifests/stable/sriov-network-operator.clusterserviceversion.yaml
@@ -100,7 +100,7 @@ metadata:
     categories: Networking
     certified: "false"
    containerImage: quay.io/openshift/origin-sriov-network-operator:4.16
-    createdAt: "2024-04-18T09:27:02Z"
+    createdAt: "2024-04-18T13:41:29Z"
     description: An operator for configuring SR-IOV components and initializing SRIOV
       network devices in Openshift cluster.
     features.operators.openshift.io/cnf: "false"