Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement a rebind to default driver as a w/a #233

Merged
merged 2 commits into from
Feb 10, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 58 additions & 17 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ func configSriovDevice(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetwor
for _, addr := range vfAddrs {
var group sriovnetworkv1.VfGroup
i := 0
var driver string
var dpdkDriver string
var isRdma bool
vfID, err := dputils.GetVFID(addr)
for i, group = range iface.VfGroups {
Expand All @@ -282,22 +282,50 @@ func configSriovDevice(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetwor
if sriovnetworkv1.IndexInRange(vfID, group.VfRange) {
isRdma = group.IsRdma
if sriovnetworkv1.StringInArray(group.DeviceType, DpdkDrivers) {
driver = group.DeviceType
dpdkDriver = group.DeviceType
}
break
}
}
if strings.EqualFold(iface.LinkType, "IB") {
if err = setVfGuid(addr, pfLink); err != nil {
return err

// only set GUID and MAC for VF with default driver
// for userspace drivers like vfio we configure the vf mac using the kernel nic mac address
// before we switch to the userspace driver
if yes, d := hasDriver(addr); yes && !sriovnetworkv1.StringInArray(d, DpdkDrivers) {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/cc @adrianchiris @mskrocki @pliurh @zshi-redhat

I added this validation instead of the `if iface.NumVfs != ifaceStatus.NumVfs` check that was proposed under #245.

I think this is a better solution because we are inside a for loop going over all the VFs. If there is an issue with one of the VFs (for example, the Intel driver gets stuck), we exit the configSriovDevice function with an error. Then, in the next reconcile, iface.NumVfs will be equal to ifaceStatus.NumVfs because the VFs were already created here (https://github.com/k8snetworkplumbingwg/sriov-network-operator/pull/233/files#diff-81ddbadfb415ccbb9c7af84f11668d1aa5e53c34025bf86d4702f16b4e42f045R246), but we never actually finished allocating the GUID or the MAC address to all the VFs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree.

if strings.EqualFold(iface.LinkType, "IB") {
if err = setVfGuid(addr, pfLink); err != nil {
return err
}
} else {
vfLink, err := vfIsReady(addr)
if err != nil {
glog.Errorf("configSriovDevice(): VF link is not ready for device %s %q", addr, err)
err = RebindVfToDefaultDriver(addr)
if err != nil {
glog.Errorf("configSriovDevice(): failed to rebind VF %s %q", addr, err)
return err
}

// Try to check the VF status again
vfLink, err = vfIsReady(addr)
if err != nil {
glog.Errorf("configSriovDevice(): VF link is not ready for device %s %q", addr, err)
return err
}

}
if err = setVfAdminMac(addr, pfLink, vfLink); err != nil {
glog.Errorf("configSriovDevice(): fail to configure VF admin mac address for device %s %q", addr, err)
return err
}
}
} else if err = setVfAdminMac(addr, pfLink); err != nil {
return err
}

if err = unbindDriverIfNeeded(addr, isRdma); err != nil {
return err
}
if driver == "" {

if dpdkDriver == "" {
if err := BindDefaultDriver(addr); err != nil {
glog.Warningf("configSriovDevice(): fail to bind default driver for device %s", addr)
return err
Expand All @@ -310,8 +338,8 @@ func configSriovDevice(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetwor
}
}
} else {
if err := BindDpdkDriver(addr, driver); err != nil {
glog.Warningf("configSriovDevice(): fail to bind driver %s for device %s", driver, addr)
if err := BindDpdkDriver(addr, dpdkDriver); err != nil {
glog.Warningf("configSriovDevice(): fail to bind driver %s for device %s", dpdkDriver, addr)
return err
}
}
Expand Down Expand Up @@ -541,7 +569,7 @@ func vfIsReady(pciAddr string) (netlink.Link, error) {
glog.Infof("vfIsReady(): VF device %s", pciAddr)
var err error
var vfLink netlink.Link
err = wait.PollImmediate(time.Second, 5*time.Second, func() (bool, error) {
err = wait.PollImmediate(time.Second, 10*time.Second, func() (bool, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need to increase the timeout?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the timeout was too short and was generating spurious errors.

vfName := tryGetInterfaceName(pciAddr)
vfLink, err = netlink.LinkByName(vfName)
if err != nil {
Expand All @@ -555,19 +583,15 @@ func vfIsReady(pciAddr string) (netlink.Link, error) {
return vfLink, nil
}

func setVfAdminMac(vfAddr string, pfLink netlink.Link) error {
func setVfAdminMac(vfAddr string, pfLink, vfLink netlink.Link) error {
glog.Infof("setVfAdminMac(): VF %s", vfAddr)

vfID, err := dputils.GetVFID(vfAddr)
if err != nil {
glog.Errorf("setVfAdminMac(): unable to get VF id %+v %q", vfAddr, err)
return err
}
vfLink, err := vfIsReady(vfAddr)
if err != nil {
glog.Errorf("setVfAdminMac(): VF link is not ready for device %+v %q", vfAddr, err)
return err
}

if err := netlink.LinkSetVfHardwareAddr(pfLink, vfID, vfLink.Attrs().HardwareAddr); err != nil {
return err
}
Expand Down Expand Up @@ -722,3 +746,20 @@ func hasMellanoxInterfacesInSpec(newState *sriovnetworkv1.SriovNetworkNodeState)
}
return false
}

// RebindVfToDefaultDriver is a workaround for the case where the VF default
// driver is stuck and is not able to create the VF kernel interface. It
// unbinds the VF from the default driver and then tries to bind it again.
//
// It returns the error from Unbind or BindDefaultDriver, whichever fails
// first, or nil once both steps have succeeded.
//
// bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2045087
func RebindVfToDefaultDriver(vfAddr string) error {
	glog.Infof("RebindVfToDefaultDriver(): VF %s", vfAddr)
	if err := Unbind(vfAddr); err != nil {
		// Log here so the failing step (unbind vs. bind) can be told apart.
		glog.Errorf("RebindVfToDefaultDriver(): fail to unbind device %s %q", vfAddr, err)
		return err
	}
	if err := BindDefaultDriver(vfAddr); err != nil {
		glog.Errorf("RebindVfToDefaultDriver(): fail to bind default driver for device %s %q", vfAddr, err)
		return err
	}

	// Warning level keeps every use of the workaround visible in the logs.
	glog.Warningf("RebindVfToDefaultDriver(): workaround implemented for VF %s", vfAddr)
	return nil
}