From 53da46be0c8e7abd321eddd6389d58078f8c78f8 Mon Sep 17 00:00:00 2001 From: sushrk Date: Fri, 29 Nov 2024 00:04:52 +0000 Subject: [PATCH] Remove PingWithTimeout test in node controller and manager cr: https://code.amazon.com/reviews/CR-162829033 --- controllers/core/node_controller.go | 31 +---------------------------- pkg/aws/ec2/api/wrapper.go | 2 +- pkg/config/loader.go | 10 +++++----- pkg/node/manager/manager.go | 18 ++--------------- 4 files changed, 9 insertions(+), 52 deletions(-) diff --git a/controllers/core/node_controller.go b/controllers/core/node_controller.go index 8a1f8b0e..5100b4b8 100644 --- a/controllers/core/node_controller.go +++ b/controllers/core/node_controller.go @@ -15,7 +15,6 @@ package controllers import ( "context" - "net/http" "time" "github.com/aws/amazon-vpc-resource-controller-k8s/apis/vpcresources/v1alpha1" @@ -23,13 +22,11 @@ import ( rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz" "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/k8s" "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/node/manager" - "github.com/google/uuid" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -124,31 +121,5 @@ func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager, healthzHandler *rcHe func (r *NodeReconciler) Check() healthz.Checker { r.Log.Info("Node controller's healthz subpath was added") - return func(req *http.Request) error { - // if the reconciler is not ready, using the simple ping to test - // this can test the referenced cached pod datastore - if !r.Conditions.GetPodDataStoreSyncStatus() { - r.Log.V(1).Info("***** node controller healthz enpoint tested Simple Ping *****") - return nil - } - - err := rcHealthz.PingWithTimeout(func(c chan<- error) { - // when the reconciler is ready, testing the reconciler with a fake node request - pingRequest := &ctrl.Request{ - NamespacedName: types.NamespacedName{ - Namespace: corev1.NamespaceDefault, - Name: uuid.New().String(), - }, - } - - // expecting to 'return ctrl.Result{}, client.IgnoreNotFound(err)' - // IgnoreNotFound returns nil on NotFound errors. - // this can test the pod cached datastore and node cached datastore - _, rErr := r.Reconcile(r.Context, *pingRequest) - r.Log.V(1).Info("***** node controller healthz endpoint tested Reconcile *****") - c <- rErr - }, r.Log) - - return err - } + return rcHealthz.SimplePing("node reconciler", r.Log) } diff --git a/pkg/aws/ec2/api/wrapper.go b/pkg/aws/ec2/api/wrapper.go index 7c7fdc78..31341f8e 100644 --- a/pkg/aws/ec2/api/wrapper.go +++ b/pkg/aws/ec2/api/wrapper.go @@ -676,7 +676,7 @@ func (e *ec2Wrapper) DescribeNetworkInterfacesPages(input *ec2.DescribeNetworkIn }) } // Add jitter to avoid EC2 API throttling in the account - time.Sleep(wait.Jitter(500*time.Millisecond, 0.5)) + time.Sleep(wait.Jitter(300*time.Millisecond, 0.5)) return true }); err != nil { diff --git a/pkg/config/loader.go b/pkg/config/loader.go index 90d1b61d..933a2a75 100644 --- a/pkg/config/loader.go +++ b/pkg/config/loader.go @@ -49,15 +49,15 @@ const ( // Tested: 12 + 8 limits (not seeing significant degradation from 15+8) // Larger number seems not make latency better than 12+8 UserServiceClientQPS = 12 - UserServiceClientQPSBurst = 8 + UserServiceClientQPSBurst = 18 // EC2 API QPS for instance service client - InstanceServiceClientQPS = 5 - InstanceServiceClientBurst = 7 + InstanceServiceClientQPS = 12 + InstanceServiceClientBurst = 18 // API Server QPS - DefaultAPIServerQPS = 10 - DefaultAPIServerBurst = 15 + DefaultAPIServerQPS = 20 + DefaultAPIServerBurst = 30 ) // LoadResourceConfig returns the Resource Configuration for all resources managed by the VPC Resource Controller. Currently diff --git a/pkg/node/manager/manager.go b/pkg/node/manager/manager.go index c9b6f9c2..92e7057b 100644 --- a/pkg/node/manager/manager.go +++ b/pkg/node/manager/manager.go @@ -16,7 +16,6 @@ package manager import ( "errors" "fmt" - "net/http" "strings" "sync" "time" @@ -30,7 +29,6 @@ import ( "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/resource" "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/utils" asyncWorker "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/worker" - "github.com/google/uuid" "github.com/samber/lo" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" @@ -559,18 +557,6 @@ func (m *manager) removeNodeSafe(nodeName string) { } func (m *manager) check() healthz.Checker { - // instead of using SimplePing, testing the node cache from manager makes the test more accurate - return func(req *http.Request) error { - err := rcHealthz.PingWithTimeout(func(c chan<- error) { - randomName := uuid.New().String() - _, found := m.GetNode(randomName) - m.Log.V(1).Info("health check tested ping GetNode to check on datastore cache in node manager successfully", "TesedNodeName", randomName, "NodeFound", found) - var ping interface{} - m.worker.SubmitJob(ping) - m.Log.V(1).Info("health check tested ping SubmitJob with a nil job to check on worker queue in node manager successfully") - c <- nil - }, m.Log) - - return err - } + // Use SimplePing in v1.4.6-internal patch + return rcHealthz.SimplePing("node manager", m.Log) }