From ec2cb3c9f8f1610a6ec14dfd6274449e4cc6e703 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 20 Jan 2022 19:41:32 +0200 Subject: [PATCH] Implement a rebind to default driver as a w/a This commit adds a workaround for an issue observed on Intel NICs where not all the VFs are created. Test: ``` cat /tmp/2.sh set -ex while : do echo 5 > /sys/bus/pci/devices/0000\:86\:00.0/sriov_numvfs sleep 4 VFS=`ip l 2>/dev/null | grep ens7f0v | wc -l` if [[ $VFS != 5 ]]; then echo "bug!" sleep INF fi echo 0 > /sys/bus/pci/devices/0000\:86\:00.0/sriov_numvfs sleep 2 done /tmp/2.sh ++ : ++ echo 5 ++ sleep 4 +++ ip l +++ grep ens7f0v +++ wc -l ++ VFS=5 ++ [[ 5 != 5 ]] ++ echo 0 ++ sleep 2 ++ : ++ echo 5 ++ sleep 4 +++ ip l +++ grep ens7f0v +++ wc -l ++ VFS=5 ++ [[ 5 != 5 ]] ++ echo 0 ++ sleep 2 ++ : ++ echo 5 ++ sleep 4 +++ ip l +++ grep ens7f0v +++ wc -l ++ VFS=4 ++ [[ 4 != 5 ]] ++ echo 'bug!' bug! ++ sleep INF d8:00.0 Ethernet controller: Intel Corporation Ethernet Controller XXV710 for 25GbE SFP28 (rev 02) d8:00.1 Ethernet controller: Intel Corporation Ethernet Controller XXV710 for 25GbE SFP28 (rev 02) d8:02.0 Ethernet controller: Intel Corporation Ethernet Virtual Function 700 Series (rev 02) d8:02.1 Ethernet controller: Intel Corporation Ethernet Virtual Function 700 Series (rev 02) d8:02.2 Ethernet controller: Intel Corporation Ethernet Virtual Function 700 Series (rev 02) d8:02.3 Ethernet controller: Intel Corporation Ethernet Virtual Function 700 Series (rev 02) d8:02.4 Ethernet controller: Intel Corporation Ethernet Virtual Function 700 Series (rev 02) [root@cnfdt14 core]# lspci -v -mm -nn -k -s d8:00.0 Slot: d8:00.0 Class: Ethernet controller [0200] Vendor: Intel Corporation [8086] Device: Ethernet Controller XXV710 for 25GbE SFP28 [158b] SVendor: Intel Corporation [8086] SDevice: Ethernet 25G 2P XXV710 Adapter [0009] Rev: 02 Driver: i40e Module: i40e NUMANode: 1 ls -la /sys/bus/pci/devices/0000\:3b\:02.1/net/ ls: cannot access '/sys/bus/pci/devices/0000:3b:02.1/net/': No 
such file or directory echo "0000:3b:02.1" > /sys/bus/pci/drivers/iavf/unbind echo "0000:3b:02.1" > /sys/bus/pci/drivers/iavf/bind [ 336.586302] pci 0000:3b:02.1: [8086:154c] type 00 class 0x020000 [ 336.592352] pci 0000:3b:02.1: enabling Extended Tags [ 336.597653] pci 0000:3b:02.1: Adding to iommu group 156 [ 336.622048] iavf 0000:3b:02.1: enabling device (0000 -> 0002) [ 336.716570] iavf 0000:3b:02.1: Device is still in reset (-16), retrying [ 337.839372] iavf 0000:3b:02.1: Invalid MAC address 00:00:00:00:00:00, using random [ 337.848651] iavf 0000:3b:02.1: Multiqueue Enabled: Queue pair count = 4 [ 337.965036] iavf 0000:3b:02.1: MAC address: 92:67:37:fd:42:25 [ 337.972234] iavf 0000:3b:02.1: GRO is enabled [ 338.004527] iavf 0000:3b:02.1 ens1f0v1: renamed from eth0 [ 338.195468] iavf 0000:3b:02.1: Reset warning received from the PF [ 338.211038] iavf 0000:3b:02.1: Scheduling reset task [ 353.590034] pci 0000:3b:02.1: Removing from iommu group 156 [ 366.167547] pci 0000:3b:02.1: [8086:154c] type 00 class 0x020000 [ 366.174623] pci 0000:3b:02.1: enabling Extended Tags [ 366.180037] pci 0000:3b:02.1: Adding to iommu group 156 [ 366.185485] iavf 0000:3b:02.1: enabling device (0000 -> 0002) [ 366.265071] iavf 0000:3b:02.1: Device is still in reset (-16), retrying [ 1255.037432] iavf 0000:3b:02.1: enabling device (0000 -> 0002) [ 1255.110439] iavf 0000:3b:02.1: Invalid MAC address 00:00:00:00:00:00, using random [ 1255.118153] iavf 0000:3b:02.1: Multiqueue Enabled: Queue pair count = 4 [ 1255.125217] iavf 0000:3b:02.1: MAC address: d2:73:2c:3c:ed:f8 [ 1255.130984] iavf 0000:3b:02.1: GRO is enabled [ 1255.135615] iavf 0000:3b:02.1 ens1f0v1: renamed from eth0 [ 1255.792238] iavf 0000:3b:02.1: Reset warning received from the PF [ 1255.798351] iavf 0000:3b:02.1: Scheduling reset task [ 1268.465116] iavf 0000:3b:02.1: Reset warning received from the PF [ 1268.479921] iavf 0000:3b:02.1: Scheduling reset task ``` We are working with kernel developers to find and fix the 
issue. This workaround gives us more time without impacting the SR-IOV operator. Signed-off-by: Sebastian Sch --- pkg/utils/utils.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index ca8ad4f10b..78cfe77f4c 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -565,8 +565,23 @@ func setVfAdminMac(vfAddr string, pfLink netlink.Link) error { } vfLink, err := vfIsReady(vfAddr) if err != nil { - glog.Errorf("setVfAdminMac(): VF link is not ready for device %+v %q", vfAddr, err) - return err + // TODO: remove workaround after intel fix driver issue + if err := Unbind(vfAddr); err != nil { + return err + } + if err := BindDefaultDriver(vfAddr); err != nil { + glog.Warningf("setVfAdminMac(): fail to bind default driver for device %s", vfAddr) + return err + } + + // Try to check the VF status again + vfLink, err = vfIsReady(vfAddr) + if err != nil { + glog.Errorf("setVfAdminMac(): VF link is not ready for device %s %q", vfAddr, err) + return err + } + + glog.Errorf("setVfAdminMac(): workaround implement for VF %s", vfAddr) } if err := netlink.LinkSetVfHardwareAddr(pfLink, vfID, vfLink.Attrs().HardwareAddr); err != nil { return err