From d22343c69078eb7e9f017f6a8919d1bf61a96671 Mon Sep 17 00:00:00 2001
From: Amanuel Engeda
Date: Wed, 18 Dec 2024 15:13:52 -0800
Subject: [PATCH] Add Support for Node Monitoring Agent

Register repair policies for the Node Monitoring Agent conditions
AcceleratedHardwareReady (10 minute toleration) and StorageReady,
NetworkingReady, KernelReady and ContainerRuntimeReady (30 minute
toleration each), all matched on status False, and add a repair policy
integration test entry for each condition.
---
 pkg/cloudprovider/cloudprovider.go            | 29 ++++++++++++++++++-
 test/suites/integration/repair_policy_test.go | 27 +++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go
index 3fe6b6f2706c..96bb2c1ea831 100644
--- a/pkg/cloudprovider/cloudprovider.go
+++ b/pkg/cloudprovider/cloudprovider.go
@@ -259,7 +259,7 @@ func getTags(ctx context.Context, nodeClass *v1.EC2NodeClass, nodeClaim *karpv1.
 
 func (c *CloudProvider) RepairPolicies() []cloudprovider.RepairPolicy {
 	return []cloudprovider.RepairPolicy{
-		// Supported Kubelet fields
+		// Supported Kubelet Node Conditions
 		{
 			ConditionType:      corev1.NodeReady,
 			ConditionStatus:    corev1.ConditionFalse,
@@ -270,6 +270,33 @@ func (c *CloudProvider) RepairPolicies() []cloudprovider.RepairPolicy {
 			ConditionStatus:    corev1.ConditionUnknown,
 			TolerationDuration: 30 * time.Minute,
 		},
+		// Supported Node Monitoring Agent Conditions
+		//
+		{
+			ConditionType:      "AcceleratedHardwareReady",
+			ConditionStatus:    corev1.ConditionFalse,
+			TolerationDuration: 10 * time.Minute,
+		},
+		{
+			ConditionType:      "StorageReady",
+			ConditionStatus:    corev1.ConditionFalse,
+			TolerationDuration: 30 * time.Minute,
+		},
+		{
+			ConditionType:      "NetworkingReady",
+			ConditionStatus:    corev1.ConditionFalse,
+			TolerationDuration: 30 * time.Minute,
+		},
+		{
+			ConditionType:      "KernelReady",
+			ConditionStatus:    corev1.ConditionFalse,
+			TolerationDuration: 30 * time.Minute,
+		},
+		{
+			ConditionType:      "ContainerRuntimeReady",
+			ConditionStatus:    corev1.ConditionFalse,
+			TolerationDuration: 30 * time.Minute,
+		},
 	}
 }
 
diff --git a/test/suites/integration/repair_policy_test.go b/test/suites/integration/repair_policy_test.go
index 56d464d972b8..991b7c11f303 100644
--- a/test/suites/integration/repair_policy_test.go
+++ b/test/suites/integration/repair_policy_test.go
@@ -73,6 +73,7 @@ var _ = Describe("Repair Policy", func() {
 			env.EventuallyExpectNotFound(pod, node)
 			env.EventuallyExpectHealthyPodCount(selector, numPods)
 		},
+		// Kubelet Supported Conditions
 		Entry("Node Ready False", corev1.NodeCondition{
 			Type:               corev1.NodeReady,
 			Status:             corev1.ConditionFalse,
@@ -83,6 +84,32 @@ var _ = Describe("Repair Policy", func() {
 			Status:             corev1.ConditionUnknown,
 			LastTransitionTime: metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
 		}),
+		// Node Monitoring Agent Supported Conditions
+		Entry("Node AcceleratedHardwareReady False", corev1.NodeCondition{
+			Type:               "AcceleratedHardwareReady",
+			Status:             corev1.ConditionFalse,
+			LastTransitionTime: metav1.Time{Time: time.Now().Add(-11 * time.Minute)},
+		}),
+		Entry("Node StorageReady False", corev1.NodeCondition{
+			Type:               "StorageReady",
+			Status:             corev1.ConditionFalse,
+			LastTransitionTime: metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
+		}),
+		Entry("Node NetworkingReady False", corev1.NodeCondition{
+			Type:               "NetworkingReady",
+			Status:             corev1.ConditionFalse,
+			LastTransitionTime: metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
+		}),
+		Entry("Node KernelReady False", corev1.NodeCondition{
+			Type:               "KernelReady",
+			Status:             corev1.ConditionFalse,
+			LastTransitionTime: metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
+		}),
+		Entry("Node ContainerRuntimeReady False", corev1.NodeCondition{
+			Type:               "ContainerRuntimeReady",
+			Status:             corev1.ConditionFalse,
+			LastTransitionTime: metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
+		}),
 	)
 	It("should ignore disruption budgets", func() {
 		nodePool.Spec.Disruption.Budgets = []karpenterv1.Budget{