Skip to content

Commit

Permalink
config-daemon: Restart all instances of device-plugin
Browse files Browse the repository at this point in the history
When the operator changes the device-plugin Spec (e.g. .Spec.NodeSelector), it may happen
that there are two device plugin pods for a given node, one that is terminating, the other that is
initializing.
If the config-daemon executes `restartDevicePluginPod()` at the same time, it may kill the terminating
pod, while the initializing one will run with the old dp configuration. This may cause one or more resources
to not being advertised, until a manual device plugin restart occurs.

Make the config-daemon restart all the device-plugin instances it founds for its own node.

Signed-off-by: Andrea Panattoni <[email protected]>
  • Loading branch information
zeeke committed Oct 10, 2024
1 parent 40fc75e commit af40ada
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 40 deletions.
53 changes: 27 additions & 26 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ func New(
workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, maxUpdateBackoff)), "SriovNetworkNodeState"),
eventRecorder: er,
disabledPlugins: disabledPlugins,
mu: &sync.Mutex{},
}
}

Expand Down Expand Up @@ -153,7 +154,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {

var timeout int64 = 5
var metadataKey = "metadata.name"
dn.mu = &sync.Mutex{}
informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient,
time.Second*15,
vars.Namespace,
Expand Down Expand Up @@ -669,7 +669,6 @@ func (dn *Daemon) restartDevicePluginPod() error {
defer dn.mu.Unlock()
log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")

var podToDelete string
pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
LabelSelector: "app=sriov-device-plugin",
FieldSelector: "spec.nodeName=" + vars.NodeName,
Expand All @@ -688,35 +687,37 @@ func (dn *Daemon) restartDevicePluginPod() error {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return nil
}
podToDelete = pods.Items[0].Name

log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
return nil
}
if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
return err
}

if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
for _, pod := range pods.Items {
podToDelete := pod.Name
log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return true, nil
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
continue
}

if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
} else {
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
return err
}

if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return true, nil
}

if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
} else {
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
}
return false, nil
}, dn.stopCh); err != nil {
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
return err
}
return false, nil
}, dn.stopCh); err != nil {
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
return err
}

return nil
Expand Down
61 changes: 47 additions & 14 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ import (
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
)

var SriovDevicePluginPod corev1.Pod

func TestConfigDaemon(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Config Daemon Suite")
Expand Down Expand Up @@ -107,19 +109,6 @@ var _ = Describe("Config Daemon", func() {
},
}

SriovDevicePluginPod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "sriov-device-plugin-xxxx",
Namespace: vars.Namespace,
Labels: map[string]string{
"app": "sriov-device-plugin",
},
},
Spec: corev1.PodSpec{
NodeName: "test-node",
},
}

err = sriovnetworkv1.AddToScheme(scheme.Scheme)
Expect(err).ToNot(HaveOccurred())
kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{
Expand All @@ -130,7 +119,7 @@ var _ = Describe("Config Daemon", func() {
Namespace: vars.Namespace,
}}).Build()

kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod)
kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs)
snclient := snclientset.NewSimpleClientset()
err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace)
Expect(err).ToNot(HaveOccurred())
Expand Down Expand Up @@ -172,6 +161,22 @@ var _ = Describe("Config Daemon", func() {
err := sut.Run(stopCh, exitCh)
Expect(err).ToNot(HaveOccurred())
}()

SriovDevicePluginPod = corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "sriov-device-plugin-xxxx",
Namespace: vars.Namespace,
Labels: map[string]string{
"app": "sriov-device-plugin",
},
},
Spec: corev1.PodSpec{
NodeName: "test-node",
},
}
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

})

AfterEach(func() {
Expand Down Expand Up @@ -283,6 +288,34 @@ var _ = Describe("Config Daemon", func() {

Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777))
})

It("restart all the sriov-device-plugin pods present on the node", func() {
otherPod1 := SriovDevicePluginPod.DeepCopy()
otherPod1.Name = "sriov-device-plugin-xxxa"
_, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

otherPod2 := SriovDevicePluginPod.DeepCopy()
otherPod2.Name = "sriov-device-plugin-xxxz"
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

err = sut.restartDevicePluginPod()
Expect(err).ToNot(HaveOccurred())

Eventually(func() (int, error) {
podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
LabelSelector: "app=sriov-device-plugin",
FieldSelector: "spec.nodeName=test-node",
})

if err != nil {
return 0, err
}

return len(podList.Items), nil
}, "1s").Should(BeZero())
})
})
})

Expand Down

0 comments on commit af40ada

Please sign in to comment.