From 53d4006782a8f1ec8513cef37a49ba8f3662019d Mon Sep 17 00:00:00 2001
From: Max Cao
Date: Tue, 14 Jan 2025 18:23:07 -0800
Subject: [PATCH] VPA: fix logs and cleanup TODOs according to review

Signed-off-by: Max Cao
---
 .../eviction/pods_eviction_restriction.go   | 25 +++++++-------
 .../pkg/updater/logic/updater.go            | 34 +++++++++----------
 .../updater/priority/priority_processor.go  |  4 +--
 .../priority/update_priority_calculator.go  | 13 ++++---
 4 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go b/vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go
index e553907b9728..a433bda5e834 100644
--- a/vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go
+++ b/vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go
@@ -124,7 +124,7 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
         // If we're already resizing this pod, don't do anything to it, unless we failed to resize it, then we want to evict it.
         if IsInPlaceUpdating(pod) {
             klog.V(4).InfoS("Pod disruption tolerance",
-                "pod", pod.Name,
+                "pod", klog.KObj(pod),
                 "running", singleGroupStats.running,
                 "configured", singleGroupStats.configured,
                 "tolerance", singleGroupStats.evictionTolerance,
@@ -132,10 +132,10 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
                 "updating", singleGroupStats.inPlaceUpdating)

             if singleGroupStats.running-(singleGroupStats.evicted+(singleGroupStats.inPlaceUpdating-1)) > shouldBeAlive {
-                klog.V(4).Infof("Would be able to evict, but already resizing %s", pod.Name)
+                klog.V(4).InfoS("Would be able to evict, but already resizing", "pod", klog.KObj(pod))
                 if pod.Status.Resize == apiv1.PodResizeStatusInfeasible || pod.Status.Resize == apiv1.PodResizeStatusDeferred {
-                    klog.Warningf("Attempted in-place resize of %s impossible, should now evict", pod.Name)
+                    klog.InfoS("Attempted in-place resize was impossible, should now evict", "pod", klog.KObj(pod))
                     return true
                 }
             }
         }
@@ -447,13 +447,13 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
     // If our QoS class is guaranteed, we can't change the resources without a restart
     // TODO(maxcao13): kubelet already prevents a resize of a guaranteed pod, so should we still check this early?
     if pod.Status.QOSClass == apiv1.PodQOSGuaranteed {
-        klog.Warningf("Can't resize %s in-place, pod QoS is %s", pod.Name, pod.Status.QOSClass)
+        klog.V(4).InfoS("Can't resize pod in-place", "pod", klog.KObj(pod), "qosClass", pod.Status.QOSClass)
         return false
     }

     // If we're already resizing this pod, don't do it again
     if IsInPlaceUpdating(pod) {
-        klog.Warningf("Not resizing %s, already resizing it", pod.Name)
+        klog.V(4).InfoS("Not resizing pod again, because we are already resizing it", "pod", klog.KObj(pod))
         return false
     }

@@ -468,7 +468,7 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
         // TODO(maxcao13): Do we have to check the policy resource too? i.e. if only memory is getting scaled, then only check the memory resize policy?
        for _, policy := range container.ResizePolicy {
            if policy.RestartPolicy != apiv1.NotRequired {
-                klog.Warningf("in-place resize of %s will cause container disruption, container %s restart policy is %v", pod.Name, container.Name, policy.RestartPolicy)
+                klog.InfoS("In-place resize of pod will cause container disruption, because of container resize policy", "pod", klog.KObj(pod), "container", container.Name, "restartPolicy", policy.RestartPolicy)
                 // TODO(jkyros): is there something that prevents this from happening elsewhere in the API?
                 if pod.Spec.RestartPolicy == apiv1.RestartPolicyNever {
                     klog.Warningf("in-place resize of %s not possible, container %s resize policy is %v but pod restartPolicy is %v", pod.Name, container.Name, policy.RestartPolicy, pod.Spec.RestartPolicy)
@@ -481,7 +481,7 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {

     // If none of the policies are populated, our feature is probably not enabled, so we can't in-place regardless
     if noRestartPoliciesPopulated {
-        klog.Warningf("impossible to resize %s in-place, container resize policies are not populated", pod.Name)
+        klog.InfoS("Impossible to resize pod in-place, container resize policies are not populated", "pod", klog.KObj(pod))
     }

     //TODO(jkyros): Come back and handle sidecar containers at some point since they're weird?
@@ -489,7 +489,7 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
     // If we're pending, we can't in-place resize
     // TODO(jkyros): are we sure we can't? Should I just set this to "if running"?
     if pod.Status.Phase == apiv1.PodPending {
-        klog.V(4).Infof("Can't resize pending pod %s", pod.Name)
+        klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))
         return false
     }
     // This second "present" check is against the creator-to-group-stats map, not the pod-to-replica map
@@ -550,7 +550,8 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     for i, calculator := range e.patchCalculators {
         p, err := calculator.CalculatePatches(podToUpdate, vpa)
         if err != nil {
-            return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", podToUpdate.Namespace, podToUpdate.Name, err)
+            klog.ErrorS(err, "Failed to calculate resource patch for pod", "pod", klog.KObj(podToUpdate))
+            return err
         }
         klog.V(4).InfoS("Calculated patches for pod", "pod", klog.KObj(podToUpdate), "patches", p)
         // TODO(maxcao13): change how this works later, this is gross and depends on the resource calculator being first in the slice
@@ -564,7 +565,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     if len(resourcePatches) > 0 {
         patch, err := json.Marshal(resourcePatches)
         if err != nil {
-            klog.Errorf("Cannot marshal the patch %v: %v", resourcePatches, err)
+            klog.ErrorS(err, "Cannot marshal the patch", "patch", resourcePatches)
             return err
         }

@@ -579,7 +580,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     if len(annotationPatches) > 0 {
         patch, err := json.Marshal(annotationPatches)
         if err != nil {
-            klog.Errorf("Cannot marshal the patch %v: %v", annotationPatches, err)
+            klog.ErrorS(err, "Cannot marshal the patch", "patch", annotationPatches)
             return err
         }
         res, err = e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
@@ -611,7 +612,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
             e.creatorToSingleGroupStatsMap[cr] = singleGroupStats
         }
     } else {
-        klog.Warningf("I updated, but my pod phase was %s", podToUpdate.Status.Phase)
+        klog.InfoS("Attempted to update", "pod", klog.KObj(podToUpdate), "phase", podToUpdate.Status.Phase)
     }

     return nil
diff --git a/vertical-pod-autoscaler/pkg/updater/logic/updater.go b/vertical-pod-autoscaler/pkg/updater/logic/updater.go
index 43133acd5ab7..f020fd122b34 100644
--- a/vertical-pod-autoscaler/pkg/updater/logic/updater.go
+++ b/vertical-pod-autoscaler/pkg/updater/logic/updater.go
@@ -247,20 +247,20 @@ func (u *updater) RunOnce(ctx context.Context) {
             fallBackToEviction, err := u.AttemptInPlaceScalingIfPossible(ctx, vpaSize, vpa, pod, evictionLimiter, vpasWithInPlaceUpdatablePodsCounter, vpasWithInPlaceUpdatedPodsCounter)
             if err != nil {
-                klog.Warningf("error attemptng to scale pod %v in-place: %v", pod.Name, err)
+                klog.ErrorS(err, "Error attempting to scale pod in-place", "pod", klog.KObj(pod))
                 return
             }

             // If in-place scaling was possible, and it isn't stuck, then skip eviction
             if fallBackToEviction {
                 // TODO(jkyros): this needs to be cleaner, but we absolutely need to make sure a disruptionless update doesn't "sneak through"
                 if prioritizedPod.IsDisruptionless() {
-                    klog.Infof("Not falling back to eviction, %v was supposed to be disruptionless", pod.Name)
+                    klog.InfoS("Not falling back to eviction, pod was supposed to be disruptionless", "pod", klog.KObj(pod))
                     continue
                 } else {
-                    klog.V(4).Infof("Falling back to eviction for %s", pod.Name)
+                    klog.V(4).InfoS("Falling back to eviction for pod", "pod", klog.KObj(pod))
                 }
             } else {
-                klog.Infof("Not falling back to eviction, because we don't have a recommendation yet? %v", pod.Name)
+                klog.InfoS("Not falling back to eviction, because we don't have a recommendation yet?", "pod", klog.KObj(pod))
                 continue
             }

@@ -392,7 +392,7 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     // queue this for in-place before the VPA has made a recommendation.
     if !VpaRecommendationProvided(vpa) {
-        klog.V(4).Infof("VPA hasn't made a recommendation yet, we're early on %s for some reason", pod.Name)
+        klog.V(4).InfoS("VPA hasn't made a recommendation for the pod yet", "pod", klog.KObj(pod))
         // TODO(jkyros): so we must have had some erroneous evictions before, but we were passing the test suite? But for now if I want to test
         // in-place I need it to not evict immediately if I can't in-place (because then it will never in-place)
         fallBackToEviction = false
@@ -405,7 +405,7 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     withInPlaceUpdatable := false
     withInPlaceUpdated := false

-    klog.V(4).Infof("Looks like we might be able to in-place update %s..", pod.Name)
+    klog.V(4).InfoS("Looks like we might be able to in-place update pod...", "pod", klog.KObj(pod))
     withInPlaceUpdatable = true
     // If I can't update
     if !evictionLimiter.CanInPlaceUpdate(pod) {
@@ -413,14 +413,14 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
         if !eviction.IsInPlaceUpdating(pod) {
             // But it's not in-place updating, something went wrong (e.g. the operation would change pods QoS)
             // fall back to eviction
-            klog.V(4).Infof("Can't in-place update pod %s, falling back to eviction, it might say no", pod.Name)
+            klog.V(4).InfoS("Can't in-place update pod, falling back to eviction, it might say no", "pod", klog.KObj(pod))
             fallBackToEviction = true
             return
         }

         lastUpdateTime, exists := u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)]
         if !exists {
-            klog.V(4).Infof("In-place update in progress for %s/%s, but no lastInPlaceUpdateTime found, setting it to now", pod.Namespace, pod.Name)
+            klog.V(4).InfoS("In-place update in progress for pod, but no lastInPlaceUpdateTime found, setting it to now", "pod", klog.KObj(pod))
             lastUpdateTime = time.Now()
             u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)] = lastUpdateTime
         }
@@ -432,23 +432,23 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
         switch pod.Status.Resize {
         case apiv1.PodResizeStatusDeferred:
             if time.Since(lastUpdateTime) > DeferredResizeUpdateTimeout {
-                klog.V(4).InfoS(fmt.Sprintf("In-place update deferred for more than %v, falling back to eviction", DeferredResizeUpdateTimeout), "pod", pod.Name)
+                klog.V(4).InfoS(fmt.Sprintf("In-place update deferred for more than %v, falling back to eviction", DeferredResizeUpdateTimeout), "pod", klog.KObj(pod))
                 fallBackToEviction = true
             } else {
-                klog.V(4).Infof("In-place update deferred for %s, NOT falling back to eviction yet", pod.Name)
+                klog.V(4).InfoS("In-place update deferred, NOT falling back to eviction yet", "pod", klog.KObj(pod))
             }
         case apiv1.PodResizeStatusInProgress:
             if time.Since(lastUpdateTime) > InProgressResizeUpdateTimeout {
-                klog.V(4).InfoS(fmt.Sprintf("In-place update in progress for more than %v, falling back to eviction", InProgressResizeUpdateTimeout), "pod", pod.Name)
+                klog.V(4).InfoS(fmt.Sprintf("In-place update in progress for more than %v, falling back to eviction", InProgressResizeUpdateTimeout), "pod", klog.KObj(pod))
                 fallBackToEviction = true
             } else {
-                klog.V(4).InfoS("In-place update in progress, NOT falling back to eviction yet", "pod", pod.Name)
+                klog.V(4).InfoS("In-place update in progress, NOT falling back to eviction yet", "pod", klog.KObj(pod))
             }
         case apiv1.PodResizeStatusInfeasible:
-            klog.V(4).InfoS("In-place update infeasible, falling back to eviction", "pod", pod.Name)
+            klog.V(4).InfoS("In-place update infeasible, falling back to eviction", "pod", klog.KObj(pod))
             fallBackToEviction = true
         default:
-            klog.V(4).InfoS("In-place update status unknown, falling back to eviction", "pod", pod.Name)
+            klog.V(4).InfoS("In-place update status unknown, falling back to eviction", "pod", klog.KObj(pod))
             fallBackToEviction = true
         }
         return
@@ -458,15 +458,15 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     err = u.evictionRateLimiter.Wait(ctx)
     if err != nil {
         // TODO(jkyros): whether or not we fall back to eviction here probably depends on *why* we failed
-        klog.Warningf("updating pod %v failed: %v", pod.Name, err)
+        klog.ErrorS(err, "Updating pod failed", "pod", klog.KObj(pod))
         return
     }

-    klog.V(2).Infof("attempting to in-place update pod %v", pod.Name)
+    klog.V(2).InfoS("Attempting to in-place update pod", "pod", klog.KObj(pod))
     u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)] = time.Now()
     evictErr := evictionLimiter.InPlaceUpdate(pod, vpa, u.eventRecorder)
     if evictErr != nil {
-        klog.Warningf("updating pod %v failed: %v", pod.Name, evictErr)
+        klog.ErrorS(evictErr, "Updating pod failed", "pod", klog.KObj(pod))
     } else {
         // TODO(jkyros): come back later for stats
         withInPlaceUpdated = false
diff --git a/vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go b/vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go
index de9cc267f4f3..317c7d5dc7f3 100644
--- a/vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go
+++ b/vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go
@@ -89,9 +89,9 @@ func (*defaultPriorityProcessor) GetUpdatePriority(pod *apiv1.Pod, vpa *vpa_type
         // It's okay if we're actually still resizing, but if we can't now or we're stuck, make sure the pod
         // is still in the list so we can evict it to go live on a fatter node or something
         if pod.Status.Resize == apiv1.PodResizeStatusDeferred || pod.Status.Resize == apiv1.PodResizeStatusInfeasible {
-            klog.V(4).Infof("Pod %s looks like it's stuck scaling up (%v state), leaving it in for eviction", pod.Name, pod.Status.Resize)
+            klog.V(4).InfoS("Pod looks like it's stuck scaling up, leaving it in for eviction", "pod", klog.KObj(pod), "resizeStatus", pod.Status.Resize)
         } else {
-            klog.V(4).Infof("Pod %s is in the process of scaling up (%v state), leaving it in so we can see if it's taking too long", pod.Name, pod.Status.Resize)
+            klog.V(4).InfoS("Pod is in the process of scaling up, leaving it in so we can see if it's taking too long", "pod", klog.KObj(pod), "resizeStatus", pod.Status.Resize)
         }
     }
     // I guess if it's not outside of compliance, it's probably okay it's stuck here?
diff --git a/vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go b/vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go
index aafd33d98cda..e87ba5bd1a3d 100644
--- a/vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go
+++ b/vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go
@@ -138,15 +138,19 @@ func (calc *UpdatePriorityCalculator) AddPod(pod *apiv1.Pod, now time.Time) {
         // TODO(jkyros): do we need an in-place update threshold arg ?
        // If our recommendations are disruptionless, we can bypass the threshold limit
        if len(disruptionlessRecommendation.ContainerRecommendations) > 0 {
-            klog.V(2).Infof("Short-lived, but pod still accepted for DISRUPTIONLESS (%d/%d) in-place update %v/%v with priority %v", len(disruptionlessRecommendation.ContainerRecommendations), len(processedRecommendation.ContainerRecommendations), pod.Namespace, pod.Name, updatePriority.ResourceDiff)
+            klog.V(2).InfoS("Short-lived, but pod still accepted for disruptionless in-place update",
+                "pod", klog.KObj(pod),
+                "numContainers", len(pod.Spec.Containers),
+                "resourceDiff", updatePriority.ResourceDiff,
+                "fractionOfDisruptionlessRecommendations", float64(len(disruptionlessRecommendation.ContainerRecommendations))/float64(len(processedRecommendation.ContainerRecommendations)),
+            )
             updatePriority.Disruptionless = true
             calc.pods = append(calc.pods, PrioritizedPod{
                 pod:            pod,
                 priority:       updatePriority,
                 recommendation: disruptionlessRecommendation})
         } else {
-            // if it's not disruptionless, we fallback to the Recreate conditions which already failed
-            // (quick oom, outside recommended range, long-lived + significant change), so don't update this pod
+            // we cannot perform this update disruption-free, so do not update this pod's resources
             klog.V(4).InfoS("Not updating a short-lived pod, request within recommended range", "pod", klog.KObj(pod))
         }
         return
@@ -176,7 +180,7 @@ func (calc *UpdatePriorityCalculator) GetSortedPrioritizedPods(admission PodEvic
         if admission.Admit(podPrio.pod, podPrio.recommendation) {
             result = append(result, &calc.pods[num])
         } else {
-            klog.V(2).Infof("pod removed from update queue by PodEvictionAdmission: %v", podPrio.pod.Name)
+            klog.V(2).InfoS("Pod removed from update queue by PodEvictionAdmission", "pod", klog.KObj(podPrio.pod))
         }
     }

@@ -325,6 +329,7 @@ func (calc *UpdatePriorityCalculator) CalculateDisruptionFreeActions(pod *apiv1.
             resourceRestartPolicy := getRestartPolicyForResource(resource, container.ResizePolicy)
             // If we don't have one, that's probably bad
             if resourceRestartPolicy == nil {
+                klog.V(4).InfoS("Container does not have a resourceResizeRestartPolicy", "pod", klog.KObj(pod), "container", container.Name)
                 continue
             }
             // If we do have one, and it's disruptive, then we know this won't work
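
Note on the logging convention this patch migrates to: klog's structured calls (InfoS/ErrorS) take a constant message followed by key/value pairs, and klog.KObj renders an object reference as namespace/name, which is why the printf-style Infof/Warningf calls above are replaced rather than just reworded. The standalone sketch below is illustrative only and is not part of the patch; the example pod and the "patch" map are made up, and it assumes k8s.io/klog/v2, k8s.io/api/core/v1, and k8s.io/apimachinery are available.

package main

import (
	"errors"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

func main() {
	// Hypothetical pod used only to demonstrate klog.KObj output ("default/example").
	pod := &apiv1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "example"}}

	// Structured info log at verbosity 4: constant message plus key/value pairs.
	klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))

	// ErrorS takes the error first, then a constant message and key/value pairs,
	// instead of embedding the error in a printf-style format string.
	err := errors.New("marshal failed")
	klog.ErrorS(err, "Cannot marshal the patch", "pod", klog.KObj(pod), "patch", map[string]string{"op": "replace"})

	klog.Flush()
}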