VPA: fix logs and cleanup TODOs according to review
Signed-off-by: Max Cao <[email protected]>
maxcao13 committed Jan 15, 2025
1 parent e406979 commit 53d4006
Showing 4 changed files with 41 additions and 35 deletions.
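Most of the diff below is a mechanical conversion from format-string logging (klog.Infof / klog.Warningf) to structured logging (klog.InfoS / klog.ErrorS) with klog.KObj, so pods are always logged as namespace/name key/value pairs. A minimal, self-contained sketch of that pattern (the pod and error values here are invented for illustration; this is not code from the changed files):

package main

import (
	"errors"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

func main() {
	pod := &apiv1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "example-pod"}}
	err := errors.New("resize rejected")

	// Old style: the pod name is interpolated into the format string.
	klog.V(4).Infof("Can't resize pending pod %s", pod.Name)

	// New style: constant message plus key/value pairs; klog.KObj renders the
	// pod as "namespace/name" rather than the bare name.
	klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))

	// Errors are passed as the first argument instead of a trailing %v verb.
	klog.ErrorS(err, "updating pod failed", "pod", klog.KObj(pod))
}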
@@ -124,18 +124,18 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
// If we're already resizing this pod, don't do anything to it, unless we failed to resize it, then we want to evict it.
if IsInPlaceUpdating(pod) {
klog.V(4).InfoS("Pod disruption tolerance",
"pod", pod.Name,
"pod", klog.KObj(pod),
"running", singleGroupStats.running,
"configured", singleGroupStats.configured,
"tolerance", singleGroupStats.evictionTolerance,
"evicted", singleGroupStats.evicted,
"updating", singleGroupStats.inPlaceUpdating)

if singleGroupStats.running-(singleGroupStats.evicted+(singleGroupStats.inPlaceUpdating-1)) > shouldBeAlive {
klog.V(4).Infof("Would be able to evict, but already resizing %s", pod.Name)
klog.V(4).InfoS("Would be able to evict, but already resizing", "pod", klog.KObj(pod))

if pod.Status.Resize == apiv1.PodResizeStatusInfeasible || pod.Status.Resize == apiv1.PodResizeStatusDeferred {
klog.Warningf("Attempted in-place resize of %s impossible, should now evict", pod.Name)
klog.InfoS("Attempted in-place resize was impossible, should now evict", "pod", klog.KObj(pod))
return true
}
}
@@ -447,13 +447,13 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
// If our QoS class is guaranteed, we can't change the resources without a restart
// TODO(maxcao13): kubelet already prevents a resize of a guaranteed pod, so should we still check this early?
if pod.Status.QOSClass == apiv1.PodQOSGuaranteed {
klog.Warningf("Can't resize %s in-place, pod QoS is %s", pod.Name, pod.Status.QOSClass)
klog.V(4).InfoS("Can't resize pod in-place", "pod", klog.KObj(pod), "qosClass", pod.Status.QOSClass)
return false
}

// If we're already resizing this pod, don't do it again
if IsInPlaceUpdating(pod) {
klog.Warningf("Not resizing %s, already resizing it", pod.Name)
klog.V(4).InfoS("Not resizing pod again, because we are already resizing it", "pod", klog.KObj(pod))
return false
}

@@ -468,7 +468,7 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
// TODO(maxcao13): Do we have to check the policy resource too? i.e. if only memory is getting scaled, then only check the memory resize policy?
for _, policy := range container.ResizePolicy {
if policy.RestartPolicy != apiv1.NotRequired {
klog.Warningf("in-place resize of %s will cause container disruption, container %s restart policy is %v", pod.Name, container.Name, policy.RestartPolicy)
klog.InfoS("in-place resize of pod will cause container disruption, because of container resize policy", "pod", klog.KObj(pod), "container", container.Name, "restartPolicy", policy.RestartPolicy)
// TODO(jkyros): is there something that prevents this from happening elsewhere in the API?
if pod.Spec.RestartPolicy == apiv1.RestartPolicyNever {
klog.Warningf("in-place resize of %s not possible, container %s resize policy is %v but pod restartPolicy is %v", pod.Name, container.Name, policy.RestartPolicy, pod.Spec.RestartPolicy)
@@ -481,15 +481,15 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {

// If none of the policies are populated, our feature is probably not enabled, so we can't in-place regardless
if noRestartPoliciesPopulated {
klog.Warningf("impossible to resize %s in-place, container resize policies are not populated", pod.Name)
klog.InfoS("impossible to resize pod in-place, container resize policies are not populated", "pod", klog.KObj(pod))
}

//TODO(jkyros): Come back and handle sidecar containers at some point since they're weird?
singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]
// If we're pending, we can't in-place resize
// TODO(jkyros): are we sure we can't? Should I just set this to "if running"?
if pod.Status.Phase == apiv1.PodPending {
klog.V(4).Infof("Can't resize pending pod %s", pod.Name)
klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))
return false
}
// This second "present" check is against the creator-to-group-stats map, not the pod-to-replica map
@@ -550,7 +550,8 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
for i, calculator := range e.patchCalculators {
p, err := calculator.CalculatePatches(podToUpdate, vpa)
if err != nil {
return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", podToUpdate.Namespace, podToUpdate.Name, err)
klog.ErrorS(err, "failed to calculate resource patch for pod", "pod", klog.KObj(podToUpdate))
return err
}
klog.V(4).InfoS("Calculated patches for pod", "pod", klog.KObj(podToUpdate), "patches", p)
// TODO(maxcao13): change how this works later, this is gross and depends on the resource calculator being first in the slice
@@ -564,7 +565,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
if len(resourcePatches) > 0 {
patch, err := json.Marshal(resourcePatches)
if err != nil {
klog.Errorf("Cannot marshal the patch %v: %v", resourcePatches, err)
klog.ErrorS(err, "Cannot marshal the patch %v", resourcePatches)
return err
}

@@ -579,7 +580,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
if len(annotationPatches) > 0 {
patch, err := json.Marshal(annotationPatches)
if err != nil {
klog.Errorf("Cannot marshal the patch %v: %v", annotationPatches, err)
klog.ErrorS(err, "Cannot marshal the patch %v", annotationPatches)
return err
}
res, err = e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
@@ -611,7 +612,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
e.creatorToSingleGroupStatsMap[cr] = singleGroupStats
}
} else {
klog.Warningf("I updated, but my pod phase was %s", podToUpdate.Status.Phase)
klog.InfoS("Attempted to update", "pod", klog.KObj(podToUpdate), "phase", podToUpdate.Status.Phase)
}

return nil
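For context on the InPlaceUpdate hunks above: the patch calculators emit JSON-patch records that are marshalled and applied to the pod with a JSONPatchType patch. The sketch below shows that flow under some assumptions; patchRecord and applyResourcePatches are illustrative stand-ins rather than the repository's actual types, and a real updater would also refresh its eviction bookkeeping (e.g. singleGroupStats) after a successful patch.

package example

import (
	"context"
	"encoding/json"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	k8stypes "k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
)

// patchRecord mirrors the usual JSON-patch shape: op/path/value.
type patchRecord struct {
	Op    string      `json:"op"`
	Path  string      `json:"path"`
	Value interface{} `json:"value"`
}

// applyResourcePatches marshals the patch records and applies them to the pod.
func applyResourcePatches(ctx context.Context, client kubernetes.Interface, namespace, name string, patches []patchRecord) error {
	if len(patches) == 0 {
		return nil
	}
	patch, err := json.Marshal(patches)
	if err != nil {
		klog.ErrorS(err, "Cannot marshal the patch", "patches", patches)
		return err
	}
	// Apply the JSON patch to the pod object via the core v1 client.
	_, err = client.CoreV1().Pods(namespace).Patch(ctx, name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
	return err
}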
vertical-pod-autoscaler/pkg/updater/logic/updater.go (34 changes: 17 additions & 17 deletions)
@@ -247,20 +247,20 @@ func (u *updater) RunOnce(ctx context.Context) {

fallBackToEviction, err := u.AttemptInPlaceScalingIfPossible(ctx, vpaSize, vpa, pod, evictionLimiter, vpasWithInPlaceUpdatablePodsCounter, vpasWithInPlaceUpdatedPodsCounter)
if err != nil {
klog.Warningf("error attemptng to scale pod %v in-place: %v", pod.Name, err)
klog.ErrorS(err, "error attemptng to scale pod in-place", "pod", klog.KObj(pod))
return
}
// If in-place scaling was possible, and it isn't stuck, then skip eviction
if fallBackToEviction {
// TODO(jkyros): this needs to be cleaner, but we absolutely need to make sure a disruptionless update doesn't "sneak through"
if prioritizedPod.IsDisruptionless() {
klog.Infof("Not falling back to eviction, %v was supposed to be disruptionless", pod.Name)
klog.InfoS("Not falling back to eviction, pod was supposed to be disruptionless", "pod", klog.KObj(pod))
continue
} else {
klog.V(4).Infof("Falling back to eviction for %s", pod.Name)
klog.V(4).InfoS("Falling back to eviction for pod", "pod", klog.KObj(pod))
}
} else {
klog.Infof("Not falling back to eviction, because we don't have a recommendation yet? %v", pod.Name)
klog.InfoS("Not falling back to eviction, because we don't have a recommendation yet?", "pod", klog.KObj(pod))
continue
}

@@ -392,7 +392,7 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
// queue this for in-place before the VPA has made a recommendation.

if !VpaRecommendationProvided(vpa) {
klog.V(4).Infof("VPA hasn't made a recommendation yet, we're early on %s for some reason", pod.Name)
klog.V(4).InfoS("VPA hasn't made a recommendation for the pod yet", "pod", klog.KObj(pod))
// TODO(jkyros): so we must have had some erroneous evictions before, but we were passing the test suite? But for now if I want to test
// in-place I need it to not evict immediately if I can't in-place (because then it will never in-place)
fallBackToEviction = false
@@ -405,22 +405,22 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
withInPlaceUpdatable := false
withInPlaceUpdated := false

klog.V(4).Infof("Looks like we might be able to in-place update %s..", pod.Name)
klog.V(4).InfoS("Looks like we might be able to in-place update pod...", "pod", klog.KObj(pod))
withInPlaceUpdatable = true
// If I can't update
if !evictionLimiter.CanInPlaceUpdate(pod) {
// If we are already updating, wait for the next loop to progress
if !eviction.IsInPlaceUpdating(pod) {
// But it's not in-place updating, something went wrong (e.g. the operation would change pods QoS)
// fall back to eviction
klog.V(4).Infof("Can't in-place update pod %s, falling back to eviction, it might say no", pod.Name)
klog.V(4).InfoS("Can't in-place update pod, falling back to eviction, it might say no", "pod", klog.KObj(pod))
fallBackToEviction = true
return
}

lastUpdateTime, exists := u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)]
if !exists {
klog.V(4).Infof("In-place update in progress for %s/%s, but no lastInPlaceUpdateTime found, setting it to now", pod.Namespace, pod.Name)
klog.V(4).InfoS("In-place update in progress for pod but no lastInPlaceUpdateTime found, setting it to now", "pod", klog.KObj(pod))
lastUpdateTime = time.Now()
u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)] = lastUpdateTime
}
@@ -432,23 +432,23 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
switch pod.Status.Resize {
case apiv1.PodResizeStatusDeferred:
if time.Since(lastUpdateTime) > DeferredResizeUpdateTimeout {
klog.V(4).InfoS(fmt.Sprintf("In-place update deferred for more than %v, falling back to eviction", DeferredResizeUpdateTimeout), "pod", pod.Name)
klog.V(4).InfoS(fmt.Sprintf("In-place update deferred for more than %v, falling back to eviction", DeferredResizeUpdateTimeout), "pod", klog.KObj(pod))
fallBackToEviction = true
} else {
klog.V(4).Infof("In-place update deferred for %s, NOT falling back to eviction yet", pod.Name)
klog.V(4).InfoS("In-place update deferred, NOT falling back to eviction yet", "pod", klog.KObj(pod))
}
case apiv1.PodResizeStatusInProgress:
if time.Since(lastUpdateTime) > InProgressResizeUpdateTimeout {
klog.V(4).InfoS(fmt.Sprintf("In-place update in progress for more than %v, falling back to eviction", InProgressResizeUpdateTimeout), "pod", pod.Name)
klog.V(4).InfoS(fmt.Sprintf("In-place update in progress for more than %v, falling back to eviction", InProgressResizeUpdateTimeout), "pod", klog.KObj(pod))
fallBackToEviction = true
} else {
klog.V(4).InfoS("In-place update in progress, NOT falling back to eviction yet", "pod", pod.Name)
klog.V(4).InfoS("In-place update in progress, NOT falling back to eviction yet", "pod", klog.KObj(pod))
}
case apiv1.PodResizeStatusInfeasible:
klog.V(4).InfoS("In-place update infeasible, falling back to eviction", "pod", pod.Name)
klog.V(4).InfoS("In-place update infeasible, falling back to eviction", "pod", klog.KObj(pod))
fallBackToEviction = true
default:
klog.V(4).InfoS("In-place update status unknown, falling back to eviction", "pod", pod.Name)
klog.V(4).InfoS("In-place update status unknown, falling back to eviction", "pod", klog.KObj(pod))
fallBackToEviction = true
}
return
@@ -458,15 +458,15 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
err = u.evictionRateLimiter.Wait(ctx)
if err != nil {
// TODO(jkyros): whether or not we fall back to eviction here probably depends on *why* we failed
klog.Warningf("updating pod %v failed: %v", pod.Name, err)
klog.ErrorS(err, "updating pod failed", "pod", klog.KObj(pod))
return
}

klog.V(2).Infof("attempting to in-place update pod %v", pod.Name)
klog.V(2).InfoS("attempting to in-place update pod", "pod", klog.KObj(pod))
u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)] = time.Now()
evictErr := evictionLimiter.InPlaceUpdate(pod, vpa, u.eventRecorder)
if evictErr != nil {
klog.Warningf("updating pod %v failed: %v", pod.Name, evictErr)
klog.ErrorS(evictErr, "updating pod failed", "pod", klog.KObj(pod))
} else {
// TODO(jkyros): come back later for stats
withInPlaceUpdated = false
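The switch on pod.Status.Resize above decides whether a stalled in-place resize should fall back to eviction. Condensed into a standalone helper for illustration (shouldFallBackToEviction is not a function in the repository, and the timeout parameters stand in for the updater's DeferredResizeUpdateTimeout and InProgressResizeUpdateTimeout), the decision looks roughly like:

package example

import (
	"time"

	apiv1 "k8s.io/api/core/v1"
)

// shouldFallBackToEviction mirrors the decision in AttemptInPlaceScalingIfPossible:
// a Deferred or InProgress resize is given some time before eviction, while an
// Infeasible (or unknown) resize status falls back to eviction immediately.
func shouldFallBackToEviction(pod *apiv1.Pod, lastUpdateTime time.Time, deferredTimeout, inProgressTimeout time.Duration) bool {
	switch pod.Status.Resize {
	case apiv1.PodResizeStatusDeferred:
		return time.Since(lastUpdateTime) > deferredTimeout
	case apiv1.PodResizeStatusInProgress:
		return time.Since(lastUpdateTime) > inProgressTimeout
	case apiv1.PodResizeStatusInfeasible:
		return true
	default:
		return true
	}
}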
@@ -89,9 +89,9 @@ func (*defaultPriorityProcessor) GetUpdatePriority(pod *apiv1.Pod, vpa *vpa_type
// It's okay if we're actually still resizing, but if we can't now or we're stuck, make sure the pod
// is still in the list so we can evict it to go live on a fatter node or something
if pod.Status.Resize == apiv1.PodResizeStatusDeferred || pod.Status.Resize == apiv1.PodResizeStatusInfeasible {
klog.V(4).Infof("Pod %s looks like it's stuck scaling up (%v state), leaving it in for eviction", pod.Name, pod.Status.Resize)
klog.V(4).InfoS("Pod looks like it's stuck scaling up, leaving it in for eviction", "pod", klog.KObj(pod), "resizeStatus", pod.Status.Resize)
} else {
klog.V(4).Infof("Pod %s is in the process of scaling up (%v state), leaving it in so we can see if it's taking too long", pod.Name, pod.Status.Resize)
klog.V(4).InfoS("Pod is in the process of scaling up, leaving it in so we can see if it's taking too long", "pod", klog.KObj(pod), "resizeStatus", pod.Status.Resize)
}
}
// I guess if it's not outside of compliance, it's probably okay it's stuck here?
@@ -138,15 +138,19 @@ func (calc *UpdatePriorityCalculator) AddPod(pod *apiv1.Pod, now time.Time) {
// TODO(jkyros): do we need an in-place update threshold arg ?
// If our recommendations are disruptionless, we can bypass the threshold limit
if len(disruptionlessRecommendation.ContainerRecommendations) > 0 {
klog.V(2).Infof("Short-lived, but pod still accepted for DISRUPTIONLESS (%d/%d) in-place update %v/%v with priority %v", len(disruptionlessRecommendation.ContainerRecommendations), len(processedRecommendation.ContainerRecommendations), pod.Namespace, pod.Name, updatePriority.ResourceDiff)
klog.V(2).InfoS("Short-lived, but pod still accepted for disruptionless in-place update",
"pod", klog.KObj(pod),
"numContainers", len(pod.Spec.Containers),
"resourceDiff", updatePriority.ResourceDiff,
"fractionOfDisruptionlessRecommendations", len(disruptionlessRecommendation.ContainerRecommendations)/len(processedRecommendation.ContainerRecommendations),
)
updatePriority.Disruptionless = true
calc.pods = append(calc.pods, PrioritizedPod{
pod: pod,
priority: updatePriority,
recommendation: disruptionlessRecommendation})
} else {
// if it's not disruptionless, we fallback to the Recreate conditions which already failed
// (quick oom, outside recommended range, long-lived + significant change), so don't update this pod
// we cannot perform this update disruption-free, so do not update this pod's resources
klog.V(4).InfoS("Not updating a short-lived pod, request within recommended range", "pod", klog.KObj(pod))
}
return
@@ -176,7 +180,7 @@ func (calc *UpdatePriorityCalculator) GetSortedPrioritizedPods(admission PodEvic
if admission.Admit(podPrio.pod, podPrio.recommendation) {
result = append(result, &calc.pods[num])
} else {
klog.V(2).Infof("pod removed from update queue by PodEvictionAdmission: %v", podPrio.pod.Name)
klog.V(2).InfoS("Pod removed from update queue by PodEvictionAdmission", "pod", klog.KObj(podPrio.pod))
}
}

@@ -325,6 +329,7 @@ func (calc *UpdatePriorityCalculator) CalculateDisruptionFreeActions(pod *apiv1.
resourceRestartPolicy := getRestartPolicyForResource(resource, container.ResizePolicy)
// If we don't have one, that's probably bad
if resourceRestartPolicy == nil {
klog.V(4).InfoS("Container does not have a resourceResizeRestartPolicy", "pod", klog.KObj(pod), "container", container.Name)
continue
}
// If we do have one, and it's disruptive, then we know this won't work
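The final hunk is in CalculateDisruptionFreeActions, which inspects each container's resize policy per resource. A simplified sketch of that per-resource check (this local getRestartPolicyForResource and isDisruptionFree are illustrations, not the repository's helpers; the real code logs and skips a resource with no declared policy rather than treating it as disruptive):

package example

import (
	apiv1 "k8s.io/api/core/v1"
)

// getRestartPolicyForResource returns the container's resize restart policy for
// the given resource, or nil if no policy is declared for it.
func getRestartPolicyForResource(resource apiv1.ResourceName, policies []apiv1.ContainerResizePolicy) *apiv1.ResourceResizeRestartPolicy {
	for i := range policies {
		if policies[i].ResourceName == resource {
			return &policies[i].RestartPolicy
		}
	}
	return nil
}

// isDisruptionFree reports whether resizing the given resource on the container
// can happen without restarting the container.
func isDisruptionFree(container apiv1.Container, resource apiv1.ResourceName) bool {
	policy := getRestartPolicyForResource(resource, container.ResizePolicy)
	if policy == nil {
		// No policy populated: the in-place resize feature is likely not enabled,
		// so conservatively treat the resize as disruptive here.
		return false
	}
	return *policy == apiv1.NotRequired
}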
