Update progressConditions even when no progressDeadlineSeconds configured (infinite deadline) (#762)

himanshu-kun authored Dec 15, 2022
1 parent d98eddf commit 251dcd1
Showing 4 changed files with 43 additions and 45 deletions.
6 changes: 3 additions & 3 deletions pkg/apis/machine/v1alpha1/machinedeployment_types.go
@@ -98,7 +98,7 @@ type MachineDeploymentSpec struct {
// process failed MachineDeployments and a condition with a ProgressDeadlineExceeded
// reason will be surfaced in the MachineDeployment status. Note that progress will
// not be estimated during the time a MachineDeployment is paused. This is not set
// by default.
// by default, which is treated as infinite deadline.
// +optional
ProgressDeadlineSeconds *int32 `json:"progressDeadlineSeconds,omitempty"`
}
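Because the semantics of an unset deadline changed from "drop the Progressing condition" to "infinite deadline", a minimal Go sketch of the two configurations may help; the import path is inferred from the file layout in this diff, and int32Ptr is a local helper, not part of the API:

package main

import (
	"fmt"

	// Import path inferred from the file layout shown in this diff.
	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
)

func int32Ptr(i int32) *int32 { return &i }

func main() {
	// Deadline left unset: now treated as an infinite deadline. The
	// Progressing condition is still maintained, but it never flips to
	// "false" with reason ProgressDeadlineExceeded.
	infinite := v1alpha1.MachineDeploymentSpec{}

	// Explicit deadline: ProgressDeadlineExceeded is surfaced if no
	// progress is observed for 600 seconds.
	bounded := v1alpha1.MachineDeploymentSpec{
		ProgressDeadlineSeconds: int32Ptr(600),
	}

	fmt.Println(infinite.ProgressDeadlineSeconds == nil) // true
	fmt.Println(*bounded.ProgressDeadlineSeconds)        // 600
}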
@@ -219,8 +219,8 @@ const (

// Progressing means the MachineDeployment is progressing. Progress for a MachineDeployment is
// considered when a new machine set is created or adopted, and when new machines scale
// up or old machines scale down. Progress is not estimated for paused MachineDeployments or
// when progressDeadlineSeconds is not specified.
// up or old machines scale down. Progress is not estimated for paused MachineDeployments. The condition
// is also updated when progressDeadlineSeconds is not specified (treated as an infinite deadline), in which case it will never be updated to "false".
MachineDeploymentProgressing MachineDeploymentConditionType = "Progressing"

// ReplicaFailure is added in a MachineDeployment when one of its machines fails to be created
13 changes: 5 additions & 8 deletions pkg/controller/deployment_progress.go
@@ -40,19 +40,13 @@ import (
func (dc *controller) syncRolloutStatus(ctx context.Context, allISs []*v1alpha1.MachineSet, newIS *v1alpha1.MachineSet, d *v1alpha1.MachineDeployment) error {
newStatus := calculateDeploymentStatus(allISs, newIS, d)

// If there is no progressDeadlineSeconds set, remove any Progressing condition.
if d.Spec.ProgressDeadlineSeconds == nil {
RemoveMachineDeploymentCondition(&newStatus, v1alpha1.MachineDeploymentProgressing)
}

// If there is only one machine set that is active then that means we are not running
// a new rollout and this is a resync where we don't need to estimate any progress.
// In such a case, we should simply not estimate any progress for this deployment.
currentCond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
isCompleteDeployment := newStatus.Replicas == newStatus.UpdatedReplicas && currentCond != nil && currentCond.Reason == NewISAvailableReason
// Check for progress only if there is a progress deadline set and the latest rollout
// hasn't completed yet.
if d.Spec.ProgressDeadlineSeconds != nil && !isCompleteDeployment {
// Check for progress only if the latest rollout hasn't completed yet.
if !isCompleteDeployment {
switch {
case MachineDeploymentComplete(d, &newStatus):
// Update the deployment conditions with a message for the new machine set that
@@ -87,6 +81,8 @@ func (dc *controller) syncRolloutStatus(ctx context.Context, allISs []*v1alpha1.
}
SetMachineDeploymentCondition(&newStatus, *condition)

// The case of marking the `Progressing` condition reason as `ProgressDeadlineExceeded` (done only when a
// non-infinite progressDeadline is set) is already handled inside `MachineDeploymentTimedOut()`.
case MachineDeploymentTimedOut(d, &newStatus):
// Update the deployment with a timeout condition. If the condition already exists,
// we ignore this update.
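The comment above leans on MachineDeploymentTimedOut() doing the right thing for an infinite deadline. A minimal sketch of that guard follows; the real implementation lives in the package's deployment utilities and is not part of this diff, and the condition field used is an assumption:

// (sketch, inside pkg/controller; GetMachineDeploymentCondition is the package's existing helper)
package controller

import (
	"time"

	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
)

func machineDeploymentTimedOut(d *v1alpha1.MachineDeployment, newStatus *v1alpha1.MachineDeploymentStatus) bool {
	// nil progressDeadlineSeconds is an infinite deadline: never time out,
	// so the Progressing condition is never marked ProgressDeadlineExceeded.
	if d.Spec.ProgressDeadlineSeconds == nil {
		return false
	}
	cond := GetMachineDeploymentCondition(*newStatus, v1alpha1.MachineDeploymentProgressing)
	if cond == nil {
		return false
	}
	// Timed out once the deadline has elapsed since progress was last observed.
	deadline := time.Duration(*d.Spec.ProgressDeadlineSeconds) * time.Second
	return time.Since(cond.LastUpdateTime.Time) > deadline
}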
@@ -161,6 +157,7 @@ func (dc *controller) getReplicaFailures(allISs []*v1alpha1.MachineSet, newIS *v
// requeueStuckDeployment checks whether the provided deployment needs to be synced for a progress
// check. It returns the time after the deployment will be requeued for the progress check, 0 if it
// will be requeued now, or -1 if it does not need to be requeued.
// This method ensures that the `Progressing` condition is updated as soon as the `progressDeadlineSeconds` timeout occurs.
func (dc *controller) requeueStuckMachineDeployment(d *v1alpha1.MachineDeployment, newStatus v1alpha1.MachineDeploymentStatus) time.Duration {
currentCond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
// Can't estimate progress if there is no deadline in the spec or progressing condition in the current status.
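The three-way contract described above (a positive duration, 0, or -1) is easiest to see as a small sketch; this is an illustrative reduction, not the method's actual body, and it assumes the helper names used in this diff:

// (sketch, inside pkg/controller; GetMachineDeploymentCondition is the package's existing helper)
package controller

import (
	"time"

	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
)

// requeueAfter returns how long to wait before the next progress check,
// 0 to requeue immediately, or -1 when no requeue is needed.
func requeueAfter(d *v1alpha1.MachineDeployment, now time.Time) time.Duration {
	cond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
	// Infinite deadline or no Progressing condition: nothing can time out.
	if d.Spec.ProgressDeadlineSeconds == nil || cond == nil {
		return time.Duration(-1)
	}
	deadline := cond.LastUpdateTime.Time.Add(time.Duration(*d.Spec.ProgressDeadlineSeconds) * time.Second)
	if remaining := deadline.Sub(now); remaining > 0 {
		// Requeue just after the deadline so the Progressing condition is
		// flipped as soon as the timeout happens.
		return remaining
	}
	return 0
}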
67 changes: 33 additions & 34 deletions pkg/controller/deployment_sync.go
@@ -79,9 +79,6 @@ func (dc *controller) sync(ctx context.Context, d *v1alpha1.MachineDeployment, i
// These conditions are needed so that we won't accidentally report lack of progress for resumed deployments
// that were paused for longer than progressDeadlineSeconds.
func (dc *controller) checkPausedConditions(ctx context.Context, d *v1alpha1.MachineDeployment) error {
if d.Spec.ProgressDeadlineSeconds == nil {
return nil
}
cond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
if cond != nil && cond.Reason == TimedOutReason {
// If we have reported lack of progress, do not overwrite it with a paused condition.
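With the early return on a nil progressDeadlineSeconds removed, this paused-condition bookkeeping now runs for every deployment. For orientation, a hedged sketch of the flow the surrounding function implements; the paused/resumed reason strings are assumptions by analogy with the upstream deployment controller:

// (sketch, inside pkg/controller; helpers and TimedOutReason are the package's existing ones,
// while the "DeploymentPaused"/"DeploymentResumed" reasons are assumed by analogy with upstream k/k)
package controller

import (
	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
)

func syncPausedCondition(d *v1alpha1.MachineDeployment) {
	cond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
	// Never overwrite a reported timeout with a paused condition.
	if cond != nil && cond.Reason == TimedOutReason {
		return
	}
	pausedCondExists := cond != nil && cond.Reason == "DeploymentPaused"
	switch {
	case d.Spec.Paused && !pausedCondExists:
		c := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing,
			v1alpha1.ConditionUnknown, "DeploymentPaused", "Deployment is paused")
		SetMachineDeploymentCondition(&d.Status, *c)
	case !d.Spec.Paused && pausedCondExists:
		c := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing,
			v1alpha1.ConditionUnknown, "DeploymentResumed", "Deployment is resumed")
		SetMachineDeploymentCondition(&d.Status, *c)
	}
}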
@@ -265,11 +262,8 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD

// Should use the revision in existingNewRS's annotation, since it was set before
needsUpdate := SetMachineDeploymentRevision(d, isCopy.Annotations[RevisionAnnotation])
// If no other Progressing condition has been recorded and we need to estimate the progress
// of this deployment then it is likely that old users started caring about progress. In that
// case we need to take into account the first time we noticed their new machine set.
cond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
if d.Spec.ProgressDeadlineSeconds != nil && cond == nil {
if cond == nil {
msg := fmt.Sprintf("Found new machine set %q", isCopy.Name)
condition := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionTrue, FoundNewISReason, msg)
SetMachineDeploymentCondition(&d.Status, *condition)
@@ -340,7 +334,7 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD
createdIS, err := dc.controlMachineClient.MachineSets(newIS.Namespace).Create(ctx, &newIS, metav1.CreateOptions{})
switch {
// We may end up hitting this due to a slow cache or a fast resync of the Deployment.
// Fetch a copy of the ReplicaSet. If its machineTemplateSpec is semantically deeply equal
// Fetch a copy of the ReplicaSet. If its owner is our deployment and its machineTemplateSpec is semantically deeply equal
// with the machineTemplateSpec of the Deployment, then that is our new ReplicaSet. Otherwise,
// this is a hash collision and we need to increment the collisionCount field in the
// status of the Deployment and try the creation again.
@@ -350,37 +344,42 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD
if isErr != nil {
return nil, isErr
}
isEqual := EqualIgnoreHash(&d.Spec.Template, &is.Spec.Template)

// brought these changes from the current latest k/k deployment code (https://github.com/kubernetes/kubernetes/blob/0e19bbb91644885a6db38a77ea3d697730269802/pkg/controller/deployment/sync.go#L240-L243)
controllerRef := metav1.GetControllerOf(is)
if controllerRef != nil && controllerRef.UID == d.UID && EqualIgnoreHash(&d.Spec.Template, &is.Spec.Template) {
// Pass through the matching ReplicaSet as the new ReplicaSet.
createdIS = is
err = nil
break
}

// Matching ReplicaSet is not equal - increment the collisionCount in the DeploymentStatus
// and requeue the Deployment.
if !isEqual {
if d.Status.CollisionCount == nil {
d.Status.CollisionCount = new(int32)
}
preCollisionCount := *d.Status.CollisionCount
*d.Status.CollisionCount++
// Update the collisionCount for the Deployment and let it requeue by returning the original
// error.
_, dErr := dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
if dErr == nil {
klog.V(2).Infof("Found a hash collision for machine deployment %q - bumping collisionCount (%d->%d) to resolve it", d.Name, preCollisionCount, *d.Status.CollisionCount)
}
return nil, err
if d.Status.CollisionCount == nil {
d.Status.CollisionCount = new(int32)
}
preCollisionCount := *d.Status.CollisionCount
*d.Status.CollisionCount++
// Update the collisionCount for the Deployment and let it requeue by returning the original
// error.
_, dErr := dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
if dErr == nil {
klog.V(2).Infof("Found a hash collision for machine deployment %q - bumping collisionCount (%d->%d) to resolve it", d.Name, preCollisionCount, *d.Status.CollisionCount)
}
// Pass through the matching ReplicaSet as the new ReplicaSet.
createdIS = is
err = nil
return nil, err
// brought this case from the current latest k/k deployment code (https://github.com/kubernetes/kubernetes/blob/0e19bbb91644885a6db38a77ea3d697730269802/pkg/controller/deployment/sync.go#L260-L262)
case errors.HasStatusCause(err, v1.NamespaceTerminatingCause):
// if the namespace is terminating, all subsequent creates will fail and we can safely do nothing
return nil, err
case err != nil:
msg := fmt.Sprintf("Failed to create new machine set %q: %v", newIS.Name, err)
if d.Spec.ProgressDeadlineSeconds != nil {
cond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionFalse, FailedISCreateReason, msg)
SetMachineDeploymentCondition(&d.Status, *cond)
// We don't really care about this error at this point, since we have a bigger issue to report.
// TODO: Identify which errors are permanent and switch DeploymentIsFailed to take into account
// these reasons as well. Related issue: https://github.com/kubernetes/kubernetes/issues/18568
_, _ = dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
}
cond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionFalse, FailedISCreateReason, msg)
SetMachineDeploymentCondition(&d.Status, *cond)
// We don't really care about this error at this point, since we have a bigger issue to report.
// TODO: Identify which errors are permanent and switch DeploymentIsFailed to take into account
// these reasons as well. Related issue: https://github.com/kubernetes/kubernetes/issues/18568
_, _ = dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
dc.recorder.Eventf(d, v1.EventTypeWarning, FailedISCreateReason, msg)
return nil, err
}
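The adoption check brought in above requires both an owner match (controllerRef.UID == d.UID) and template equality modulo the hash label. A hedged sketch of the EqualIgnoreHash idea, following the upstream pattern; the label key and field layout are assumptions:

// (sketch, inside pkg/controller; the hash label key is an assumption, and the real
// EqualIgnoreHash lives in the package's utilities)
package controller

import (
	apiequality "k8s.io/apimachinery/pkg/api/equality"

	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
)

// equalIgnoreHash compares two machine templates while ignoring the
// template-hash label, so a machine set created for the same template
// (plus its hash label) is recognized as "ours".
func equalIgnoreHash(t1, t2 *v1alpha1.MachineTemplateSpec) bool {
	a, b := t1.DeepCopy(), t2.DeepCopy()
	delete(a.ObjectMeta.Labels, "machine-template-hash") // assumed label key
	delete(b.ObjectMeta.Labels, "machine-template-hash")
	return apiequality.Semantic.DeepEqual(a, b)
}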
@@ -389,7 +388,7 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD
}

needsUpdate := SetMachineDeploymentRevision(d, newRevision)
if !alreadyExists && d.Spec.ProgressDeadlineSeconds != nil {
if !alreadyExists {
msg := fmt.Sprintf("Created new machine set %q", createdIS.Name)
condition := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionTrue, NewMachineSetReason, msg)
SetMachineDeploymentCondition(&d.Status, *condition)
2 changes: 2 additions & 0 deletions pkg/controller/machine_safety.go
@@ -358,6 +358,8 @@ func (c *controller) checkAndFreezeORUnfreezeMachineSets(ctx context.Context) er
lowerThreshold := higherThreshold - c.safetyOptions.SafetyDown

machineDeployments := c.getMachineDeploymentsForMachineSet(machineSet)
// if we have a parent machineDeployment then we use a different higherThreshold and lowerThreshold,
// keeping in mind the rolling update scenario, as we don't want to freeze during a normal rolling update.
if len(machineDeployments) >= 1 {
machineDeployment := machineDeployments[0]
if machineDeployment != nil {
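The widening the new comment describes is easiest to see numerically. A hedged sketch follows; SafetyUp and SafetyDown mirror the safetyOptions fields used above, while folding maxSurge into the upper bound is an assumption for illustration:

// Sketch only: widen freeze thresholds for machine sets owned by a deployment,
// so a normal rolling update (old + new machine sets briefly exceeding the
// desired count) is not mistaken for a runaway scale-up.
package main

import "fmt"

func thresholds(desired, maxSurge, safetyUp, safetyDown int32, hasParentDeployment bool) (higher, lower int32) {
	higher = desired + safetyUp
	if hasParentDeployment {
		// During a rolling update old and new machine sets coexist, so
		// allow for the surge machines before declaring a runaway scale-up.
		higher = desired + maxSurge + safetyUp
	}
	lower = higher - safetyDown
	return higher, lower
}

func main() {
	hi, lo := thresholds(10, 2, 2, 1, true)
	fmt.Println(hi, lo) // 14 13
}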
