From 53d4006782a8f1ec8513cef37a49ba8f3662019d Mon Sep 17 00:00:00 2001
From: Max Cao
Date: Tue, 14 Jan 2025 18:23:07 -0800
Subject: [PATCH] VPA: fix logs and cleanup TODOs according to review

Signed-off-by: Max Cao
---
 .../eviction/pods_eviction_restriction.go   | 25 +++++++-------
 .../pkg/updater/logic/updater.go            | 34 +++++++++----------
 .../updater/priority/priority_processor.go  |  4 +--
 .../priority/update_priority_calculator.go  | 13 ++++---
 4 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go b/vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go
index e553907b9728..a433bda5e834 100644
--- a/vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go
+++ b/vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go
@@ -124,7 +124,7 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
         // If we're already resizing this pod, don't do anything to it, unless we failed to resize it, then we want to evict it.
         if IsInPlaceUpdating(pod) {
             klog.V(4).InfoS("Pod disruption tolerance",
-                "pod", pod.Name,
+                "pod", klog.KObj(pod),
                 "running", singleGroupStats.running,
                 "configured", singleGroupStats.configured,
                 "tolerance", singleGroupStats.evictionTolerance,
@@ -132,10 +132,10 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
                 "updating", singleGroupStats.inPlaceUpdating)

             if singleGroupStats.running-(singleGroupStats.evicted+(singleGroupStats.inPlaceUpdating-1)) > shouldBeAlive {
-                klog.V(4).Infof("Would be able to evict, but already resizing %s", pod.Name)
+                klog.V(4).InfoS("Would be able to evict, but already resizing", "pod", klog.KObj(pod))
                 if pod.Status.Resize == apiv1.PodResizeStatusInfeasible || pod.Status.Resize == apiv1.PodResizeStatusDeferred {
-                    klog.Warningf("Attempted in-place resize of %s impossible, should now evict", pod.Name)
+                    klog.InfoS("Attempted in-place resize was impossible, should now evict", "pod", klog.KObj(pod))
                     return true
                 }
             }
         }
@@ -447,13 +447,13 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
     // If our QoS class is guaranteed, we can't change the resources without a restart
     // TODO(maxcao13): kubelet already prevents a resize of a guaranteed pod, so should we still check this early?
     if pod.Status.QOSClass == apiv1.PodQOSGuaranteed {
-        klog.Warningf("Can't resize %s in-place, pod QoS is %s", pod.Name, pod.Status.QOSClass)
+        klog.V(4).InfoS("Can't resize pod in-place", "pod", klog.KObj(pod), "qosClass", pod.Status.QOSClass)
         return false
     }

     // If we're already resizing this pod, don't do it again
     if IsInPlaceUpdating(pod) {
-        klog.Warningf("Not resizing %s, already resizing it", pod.Name)
+        klog.V(4).InfoS("Not resizing pod again, because we are already resizing it", "pod", klog.KObj(pod))
         return false
     }

@@ -468,7 +468,7 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
         // TODO(maxcao13): Do we have to check the policy resource too? i.e. if only memory is getting scaled, then only check the memory resize policy?
        for _, policy := range container.ResizePolicy {
            if policy.RestartPolicy != apiv1.NotRequired {
-                klog.Warningf("in-place resize of %s will cause container disruption, container %s restart policy is %v", pod.Name, container.Name, policy.RestartPolicy)
+                klog.InfoS("In-place resize of pod will cause container disruption, because of container resize policy", "pod", klog.KObj(pod), "container", container.Name, "restartPolicy", policy.RestartPolicy)
                 // TODO(jkyros): is there something that prevents this from happening elsewhere in the API?
                 if pod.Spec.RestartPolicy == apiv1.RestartPolicyNever {
                     klog.Warningf("in-place resize of %s not possible, container %s resize policy is %v but pod restartPolicy is %v", pod.Name, container.Name, policy.RestartPolicy, pod.Spec.RestartPolicy)
@@ -481,7 +481,7 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {

     // If none of the policies are populated, our feature is probably not enabled, so we can't in-place regardless
     if noRestartPoliciesPopulated {
-        klog.Warningf("impossible to resize %s in-place, container resize policies are not populated", pod.Name)
+        klog.InfoS("Impossible to resize pod in-place, container resize policies are not populated", "pod", klog.KObj(pod))
     }

     //TODO(jkyros): Come back and handle sidecar containers at some point since they're weird?
@@ -489,7 +489,7 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
     // If we're pending, we can't in-place resize
     // TODO(jkyros): are we sure we can't? Should I just set this to "if running"?
     if pod.Status.Phase == apiv1.PodPending {
-        klog.V(4).Infof("Can't resize pending pod %s", pod.Name)
+        klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))
         return false
     }
     // This second "present" check is against the creator-to-group-stats map, not the pod-to-replica map
@@ -550,7 +550,8 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     for i, calculator := range e.patchCalculators {
         p, err := calculator.CalculatePatches(podToUpdate, vpa)
         if err != nil {
-            return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", podToUpdate.Namespace, podToUpdate.Name, err)
+            klog.ErrorS(err, "Failed to calculate resource patch for pod", "pod", klog.KObj(podToUpdate))
+            return err
         }
         klog.V(4).InfoS("Calculated patches for pod", "pod", klog.KObj(podToUpdate), "patches", p)
         // TODO(maxcao13): change how this works later, this is gross and depends on the resource calculator being first in the slice
@@ -564,7 +565,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     if len(resourcePatches) > 0 {
         patch, err := json.Marshal(resourcePatches)
         if err != nil {
-            klog.Errorf("Cannot marshal the patch %v: %v", resourcePatches, err)
+            klog.ErrorS(err, "Cannot marshal the patch", "patch", resourcePatches)
             return err
         }

@@ -579,7 +580,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     if len(annotationPatches) > 0 {
         patch, err := json.Marshal(annotationPatches)
         if err != nil {
-            klog.Errorf("Cannot marshal the patch %v: %v", annotationPatches, err)
+            klog.ErrorS(err, "Cannot marshal the patch", "patch", annotationPatches)
             return err
         }
         res, err = e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
@@ -611,7 +612,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
             e.creatorToSingleGroupStatsMap[cr] = singleGroupStats
         }
     } else {
-        klog.Warningf("I updated, but my pod phase was %s", podToUpdate.Status.Phase)
+        klog.InfoS("Attempted to update", "pod", klog.KObj(podToUpdate), "phase", podToUpdate.Status.Phase)
     }

     return nil
diff --git a/vertical-pod-autoscaler/pkg/updater/logic/updater.go b/vertical-pod-autoscaler/pkg/updater/logic/updater.go
index 43133acd5ab7..f020fd122b34 100644
--- a/vertical-pod-autoscaler/pkg/updater/logic/updater.go
+++ b/vertical-pod-autoscaler/pkg/updater/logic/updater.go
@@ -247,20 +247,20 @@ func (u *updater) RunOnce(ctx context.Context) {
             fallBackToEviction, err := u.AttemptInPlaceScalingIfPossible(ctx, vpaSize, vpa, pod, evictionLimiter, vpasWithInPlaceUpdatablePodsCounter, vpasWithInPlaceUpdatedPodsCounter)
             if err != nil {
-                klog.Warningf("error attemptng to scale pod %v in-place: %v", pod.Name, err)
+                klog.ErrorS(err, "Error attempting to scale pod in-place", "pod", klog.KObj(pod))
                 return
             }

             // If in-place scaling was possible, and it isn't stuck, then skip eviction
             if fallBackToEviction {
                 // TODO(jkyros): this needs to be cleaner, but we absolutely need to make sure a disruptionless update doesn't "sneak through"
                 if prioritizedPod.IsDisruptionless() {
-                    klog.Infof("Not falling back to eviction, %v was supposed to be disruptionless", pod.Name)
+                    klog.InfoS("Not falling back to eviction, pod was supposed to be disruptionless", "pod", klog.KObj(pod))
                     continue
                 } else {
-                    klog.V(4).Infof("Falling back to eviction for %s", pod.Name)
+                    klog.V(4).InfoS("Falling back to eviction for pod", "pod", klog.KObj(pod))
                 }
             } else {
-                klog.Infof("Not falling back to eviction, because we don't have a recommendation yet? %v", pod.Name)
+                klog.InfoS("Not falling back to eviction, because we don't have a recommendation yet?", "pod", klog.KObj(pod))
                 continue
             }

@@ -392,7 +392,7 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     // queue this for in-place before the VPA has made a recommendation.
     if !VpaRecommendationProvided(vpa) {
-        klog.V(4).Infof("VPA hasn't made a recommendation yet, we're early on %s for some reason", pod.Name)
+        klog.V(4).InfoS("VPA hasn't made a recommendation for the pod yet", "pod", klog.KObj(pod))
         // TODO(jkyros): so we must have had some erroneous evictions before, but we were passing the test suite? But for now if I want to test
         // in-place I need it to not evict immediately if I can't in-place (because then it will never in-place)
         fallBackToEviction = false
@@ -405,7 +405,7 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     withInPlaceUpdatable := false
     withInPlaceUpdated := false

-    klog.V(4).Infof("Looks like we might be able to in-place update %s..", pod.Name)
+    klog.V(4).InfoS("Looks like we might be able to in-place update pod...", "pod", klog.KObj(pod))
     withInPlaceUpdatable = true
     // If I can't update
     if !evictionLimiter.CanInPlaceUpdate(pod) {
@@ -413,14 +413,14 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
         if !eviction.IsInPlaceUpdating(pod) {
             // But it's not in-place updating, something went wrong (e.g. the operation would change pods QoS)
             // fall back to eviction
-            klog.V(4).Infof("Can't in-place update pod %s, falling back to eviction, it might say no", pod.Name)
+            klog.V(4).InfoS("Can't in-place update pod, falling back to eviction, it might say no", "pod", klog.KObj(pod))
             fallBackToEviction = true
             return
         }

         lastUpdateTime, exists := u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)]
         if !exists {
-            klog.V(4).Infof("In-place update in progress for %s/%s, but no lastInPlaceUpdateTime found, setting it to now", pod.Namespace, pod.Name)
+            klog.V(4).InfoS("In-place update in progress for pod, but no lastInPlaceUpdateTime found, setting it to now", "pod", klog.KObj(pod))
             lastUpdateTime = time.Now()
             u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)] = lastUpdateTime
         }
@@ -432,23 +432,23 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
         switch pod.Status.Resize {
         case apiv1.PodResizeStatusDeferred:
             if time.Since(lastUpdateTime) > DeferredResizeUpdateTimeout {
-                klog.V(4).InfoS(fmt.Sprintf("In-place update deferred for more than %v, falling back to eviction", DeferredResizeUpdateTimeout), "pod", pod.Name)
+                klog.V(4).InfoS(fmt.Sprintf("In-place update deferred for more than %v, falling back to eviction", DeferredResizeUpdateTimeout), "pod", klog.KObj(pod))
                 fallBackToEviction = true
             } else {
-                klog.V(4).Infof("In-place update deferred for %s, NOT falling back to eviction yet", pod.Name)
+                klog.V(4).InfoS("In-place update deferred, NOT falling back to eviction yet", "pod", klog.KObj(pod))
             }
         case apiv1.PodResizeStatusInProgress:
             if time.Since(lastUpdateTime) > InProgressResizeUpdateTimeout {
-                klog.V(4).InfoS(fmt.Sprintf("In-place update in progress for more than %v, falling back to eviction", InProgressResizeUpdateTimeout), "pod", pod.Name)
+                klog.V(4).InfoS(fmt.Sprintf("In-place update in progress for more than %v, falling back to eviction", InProgressResizeUpdateTimeout), "pod", klog.KObj(pod))
                 fallBackToEviction = true
             } else {
-                klog.V(4).InfoS("In-place update in progress, NOT falling back to eviction yet", "pod", pod.Name)
+                klog.V(4).InfoS("In-place update in progress, NOT falling back to eviction yet", "pod", klog.KObj(pod))
             }
         case apiv1.PodResizeStatusInfeasible:
-            klog.V(4).InfoS("In-place update infeasible, falling back to eviction", "pod", pod.Name)
+            klog.V(4).InfoS("In-place update infeasible, falling back to eviction", "pod", klog.KObj(pod))
             fallBackToEviction = true
         default:
-            klog.V(4).InfoS("In-place update status unknown, falling back to eviction", "pod", pod.Name)
+            klog.V(4).InfoS("In-place update status unknown, falling back to eviction", "pod", klog.KObj(pod))
             fallBackToEviction = true
         }
         return
@@ -458,15 +458,15 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     err = u.evictionRateLimiter.Wait(ctx)
     if err != nil {
         // TODO(jkyros): whether or not we fall back to eviction here probably depends on *why* we failed
-        klog.Warningf("updating pod %v failed: %v", pod.Name, err)
+        klog.ErrorS(err, "Updating pod failed", "pod", klog.KObj(pod))
         return
     }

-    klog.V(2).Infof("attempting to in-place update pod %v", pod.Name)
+    klog.V(2).InfoS("Attempting to in-place update pod", "pod", klog.KObj(pod))
     u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)] = time.Now()
     evictErr := evictionLimiter.InPlaceUpdate(pod, vpa, u.eventRecorder)
     if evictErr != nil {
-        klog.Warningf("updating pod %v failed: %v", pod.Name, evictErr)
+        klog.ErrorS(evictErr, "Updating pod failed", "pod", klog.KObj(pod))
     } else {
         // TODO(jkyros): come back later for stats
         withInPlaceUpdated = false
diff --git a/vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go b/vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go
index de9cc267f4f3..317c7d5dc7f3 100644
--- a/vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go
+++ b/vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go
@@ -89,9 +89,9 @@ func (*defaultPriorityProcessor) GetUpdatePriority(pod *apiv1.Pod, vpa *vpa_type
         // It's okay if we're actually still resizing, but if we can't now or we're stuck, make sure the pod
         // is still in the list so we can evict it to go live on a fatter node or something
         if pod.Status.Resize == apiv1.PodResizeStatusDeferred || pod.Status.Resize == apiv1.PodResizeStatusInfeasible {
-            klog.V(4).Infof("Pod %s looks like it's stuck scaling up (%v state), leaving it in for eviction", pod.Name, pod.Status.Resize)
+            klog.V(4).InfoS("Pod looks like it's stuck scaling up, leaving it in for eviction", "pod", klog.KObj(pod), "resizeStatus", pod.Status.Resize)
         } else {
-            klog.V(4).Infof("Pod %s is in the process of scaling up (%v state), leaving it in so we can see if it's taking too long", pod.Name, pod.Status.Resize)
+            klog.V(4).InfoS("Pod is in the process of scaling up, leaving it in so we can see if it's taking too long", "pod", klog.KObj(pod), "resizeStatus", pod.Status.Resize)
         }
     }
     // I guess if it's not outside of compliance, it's probably okay it's stuck here?
diff --git a/vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go b/vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go
index aafd33d98cda..e87ba5bd1a3d 100644
--- a/vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go
+++ b/vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go
@@ -138,15 +138,19 @@ func (calc *UpdatePriorityCalculator) AddPod(pod *apiv1.Pod, now time.Time) {
         // TODO(jkyros): do we need an in-place update threshold arg ?
        // If our recommendations are disruptionless, we can bypass the threshold limit
        if len(disruptionlessRecommendation.ContainerRecommendations) > 0 {
-            klog.V(2).Infof("Short-lived, but pod still accepted for DISRUPTIONLESS (%d/%d) in-place update %v/%v with priority %v", len(disruptionlessRecommendation.ContainerRecommendations), len(processedRecommendation.ContainerRecommendations), pod.Namespace, pod.Name, updatePriority.ResourceDiff)
+            klog.V(2).InfoS("Short-lived, but pod still accepted for disruptionless in-place update",
+                "pod", klog.KObj(pod),
+                "numContainers", len(pod.Spec.Containers),
+                "resourceDiff", updatePriority.ResourceDiff,
+                "fractionOfDisruptionlessRecommendations", float64(len(disruptionlessRecommendation.ContainerRecommendations))/float64(len(processedRecommendation.ContainerRecommendations)),
+            )
             updatePriority.Disruptionless = true
             calc.pods = append(calc.pods, PrioritizedPod{
                 pod:            pod,
                 priority:       updatePriority,
                 recommendation: disruptionlessRecommendation})
         } else {
-            // if it's not disruptionless, we fallback to the Recreate conditions which already failed
-            // (quick oom, outside recommended range, long-lived + significant change), so don't update this pod
+            // we cannot perform this update disruption-free, so do not update this pod's resources
             klog.V(4).InfoS("Not updating a short-lived pod, request within recommended range", "pod", klog.KObj(pod))
         }
         return
@@ -176,7 +180,7 @@ func (calc *UpdatePriorityCalculator) GetSortedPrioritizedPods(admission PodEvic
         if admission.Admit(podPrio.pod, podPrio.recommendation) {
             result = append(result, &calc.pods[num])
         } else {
-            klog.V(2).Infof("pod removed from update queue by PodEvictionAdmission: %v", podPrio.pod.Name)
+            klog.V(2).InfoS("Pod removed from update queue by PodEvictionAdmission", "pod", klog.KObj(podPrio.pod))
         }
     }

@@ -325,6 +329,7 @@ func (calc *UpdatePriorityCalculator) CalculateDisruptionFreeActions(pod *apiv1.
             resourceRestartPolicy := getRestartPolicyForResource(resource, container.ResizePolicy)
             // If we don't have one, that's probably bad
             if resourceRestartPolicy == nil {
+                klog.V(4).InfoS("Container does not have a resourceResizeRestartPolicy", "pod", klog.KObj(pod), "container", container.Name)
                 continue
             }
             // If we do have one, and it's disruptive, then we know this won't work
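
Note on the logging convention this patch migrates to: klog's structured calls (InfoS/ErrorS) take a constant message followed by key/value pairs, and klog.KObj renders an object reference as namespace/name, which is why the printf-style Infof/Warningf calls above are replaced rather than just reworded. The standalone sketch below is illustrative only and is not part of the patch; the example pod and the "patch" map are made up, and it assumes k8s.io/klog/v2, k8s.io/api/core/v1, and k8s.io/apimachinery are available.

package main

import (
	"errors"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

func main() {
	// Hypothetical pod used only to demonstrate klog.KObj output ("default/example").
	pod := &apiv1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "example"}}

	// Structured info log at verbosity 4: constant message plus key/value pairs.
	klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))

	// ErrorS takes the error first, then a constant message and key/value pairs,
	// instead of embedding the error in a printf-style format string.
	err := errors.New("marshal failed")
	klog.ErrorS(err, "Cannot marshal the patch", "pod", klog.KObj(pod), "patch", map[string]string{"op": "replace"})

	klog.Flush()
}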