diff --git a/pkg/apis/machine/v1alpha1/machinedeployment_types.go b/pkg/apis/machine/v1alpha1/machinedeployment_types.go index 41243c625..1161e7180 100644 --- a/pkg/apis/machine/v1alpha1/machinedeployment_types.go +++ b/pkg/apis/machine/v1alpha1/machinedeployment_types.go @@ -98,7 +98,7 @@ type MachineDeploymentSpec struct { // process failed MachineDeployments and a condition with a ProgressDeadlineExceeded // reason will be surfaced in the MachineDeployment status. Note that progress will // not be estimated during the time a MachineDeployment is paused. This is not set - // by default. + // by default, which is treated as infinite deadline. // +optional ProgressDeadlineSeconds *int32 `json:"progressDeadlineSeconds,omitempty"` } @@ -219,8 +219,8 @@ const ( // Progressing means the MachineDeployment is progressing. Progress for a MachineDeployment is // considered when a new machine set is created or adopted, and when new machines scale - // up or old machines scale down. Progress is not estimated for paused MachineDeployments or - // when progressDeadlineSeconds is not specified. + // up or old machines scale down. Progress is not estimated for paused MachineDeployments. It is still estimated + // if progressDeadlineSeconds is not specified (treated as an infinite deadline), in which case the condition would never be updated to "false". 
MachineDeploymentProgressing MachineDeploymentConditionType = "Progressing" // ReplicaFailure is added in a MachineDeployment when one of its machines fails to be created diff --git a/pkg/controller/deployment_progress.go b/pkg/controller/deployment_progress.go index 38e4d40b4..6dc6ea1af 100644 --- a/pkg/controller/deployment_progress.go +++ b/pkg/controller/deployment_progress.go @@ -40,19 +40,13 @@ import ( func (dc *controller) syncRolloutStatus(ctx context.Context, allISs []*v1alpha1.MachineSet, newIS *v1alpha1.MachineSet, d *v1alpha1.MachineDeployment) error { newStatus := calculateDeploymentStatus(allISs, newIS, d) - // If there is no progressDeadlineSeconds set, remove any Progressing condition. - if d.Spec.ProgressDeadlineSeconds == nil { - RemoveMachineDeploymentCondition(&newStatus, v1alpha1.MachineDeploymentProgressing) - } - // If there is only one machine set that is active then that means we are not running // a new rollout and this is a resync where we don't need to estimate any progress. // In such a case, we should simply not estimate any progress for this deployment. currentCond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing) isCompleteDeployment := newStatus.Replicas == newStatus.UpdatedReplicas && currentCond != nil && currentCond.Reason == NewISAvailableReason - // Check for progress only if there is a progress deadline set and the latest rollout - // hasn't completed yet. - if d.Spec.ProgressDeadlineSeconds != nil && !isCompleteDeployment { + // Check for progress only if the latest rollout hasn't completed yet. + if !isCompleteDeployment { switch { case MachineDeploymentComplete(d, &newStatus): // Update the deployment conditions with a message for the new machine set that @@ -87,6 +81,8 @@ func (dc *controller) syncRolloutStatus(ctx context.Context, allISs []*v1alpha1. 
} SetMachineDeploymentCondition(&newStatus, *condition) + // The case of marking the `Progressing` condition reason as `ProgressDeadlineExceeded` only when a non-infinite progressDeadline is set + // is already handled inside `MachineDeploymentTimedOut()` case MachineDeploymentTimedOut(d, &newStatus): // Update the deployment with a timeout condition. If the condition already exists, // we ignore this update. @@ -161,6 +157,7 @@ func (dc *controller) getReplicaFailures(allISs []*v1alpha1.MachineSet, newIS *v // requeueStuckDeployment checks whether the provided deployment needs to be synced for a progress // check. It returns the time after the deployment will be requeued for the progress check, 0 if it // will be requeued now, or -1 if it does not need to be requeued. +// This method just makes sure that the `Progressing` condition is updated as soon as the `progressDeadlineSeconds` timeout happens. func (dc *controller) requeueStuckMachineDeployment(d *v1alpha1.MachineDeployment, newStatus v1alpha1.MachineDeploymentStatus) time.Duration { currentCond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing) // Can't estimate progress if there is no deadline in the spec or progressing condition in the current status. diff --git a/pkg/controller/deployment_sync.go b/pkg/controller/deployment_sync.go index 4eed4e139..4c9c3a30a 100644 --- a/pkg/controller/deployment_sync.go +++ b/pkg/controller/deployment_sync.go @@ -79,9 +79,6 @@ func (dc *controller) sync(ctx context.Context, d *v1alpha1.MachineDeployment, i // These conditions are needed so that we won't accidentally report lack of progress for resumed deployments // that were paused for longer than progressDeadlineSeconds. 
func (dc *controller) checkPausedConditions(ctx context.Context, d *v1alpha1.MachineDeployment) error { - if d.Spec.ProgressDeadlineSeconds == nil { - return nil - } cond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing) if cond != nil && cond.Reason == TimedOutReason { // If we have reported lack of progress, do not overwrite it with a paused condition. @@ -265,11 +262,8 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD // Should use the revision in existingNewRS's annotation, since it set by before needsUpdate := SetMachineDeploymentRevision(d, isCopy.Annotations[RevisionAnnotation]) - // If no other Progressing condition has been recorded and we need to estimate the progress - // of this deployment then it is likely that old users started caring about progress. In that - // case we need to take into account the first time we noticed their new machine set. cond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing) - if d.Spec.ProgressDeadlineSeconds != nil && cond == nil { + if cond == nil { msg := fmt.Sprintf("Found new machine set %q", isCopy.Name) condition := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionTrue, FoundNewISReason, msg) SetMachineDeploymentCondition(&d.Status, *condition) @@ -340,7 +334,7 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD createdIS, err := dc.controlMachineClient.MachineSets(newIS.Namespace).Create(ctx, &newIS, metav1.CreateOptions{}) switch { // We may end up hitting this due to a slow cache or a fast resync of the Deployment. - // Fetch a copy of the ReplicaSet. If its machineTemplateSpec is semantically deep equal + // Fetch a copy of the ReplicaSet. If its owner is our deployment and its machineTemplateSpec is semantically deep equal // with the machineTemplateSpec of the Deployment, then that is our new ReplicaSet. 
Otherwise, // this is a hash collision and we need to increment the collisionCount field in the // status of the Deployment and try the creation again. @@ -350,37 +344,42 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD if isErr != nil { return nil, isErr } - isEqual := EqualIgnoreHash(&d.Spec.Template, &is.Spec.Template) + + // brought these changes over from the current latest k/k deployment code (https://github.com/kubernetes/kubernetes/blob/0e19bbb91644885a6db38a77ea3d697730269802/pkg/controller/deployment/sync.go#L240-L243) + controllerRef := metav1.GetControllerOf(is) + if controllerRef != nil && controllerRef.UID == d.UID && EqualIgnoreHash(&d.Spec.Template, &is.Spec.Template) { + // Pass through the matching ReplicaSet as the new ReplicaSet. + createdIS = is + err = nil + break + } // Matching ReplicaSet is not equal - increment the collisionCount in the DeploymentStatus // and requeue the Deployment. - if !isEqual { - if d.Status.CollisionCount == nil { - d.Status.CollisionCount = new(int32) - } - preCollisionCount := *d.Status.CollisionCount - *d.Status.CollisionCount++ - // Update the collisionCount for the Deployment and let it requeue by returning the original - // error. - _, dErr := dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{}) - if dErr == nil { - klog.V(2).Infof("Found a hash collision for machine deployment %q - bumping collisionCount (%d->%d) to resolve it", d.Name, preCollisionCount, *d.Status.CollisionCount) - } - return nil, err + if d.Status.CollisionCount == nil { + d.Status.CollisionCount = new(int32) + } + preCollisionCount := *d.Status.CollisionCount + *d.Status.CollisionCount++ + // Update the collisionCount for the Deployment and let it requeue by returning the original + // error. 
+ _, dErr := dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{}) + if dErr == nil { + klog.V(2).Infof("Found a hash collision for machine deployment %q - bumping collisionCount (%d->%d) to resolve it", d.Name, preCollisionCount, *d.Status.CollisionCount) } - // Pass through the matching ReplicaSet as the new ReplicaSet. - createdIS = is - err = nil + return nil, err + // brought this case over from the current latest k/k deployment code (https://github.com/kubernetes/kubernetes/blob/0e19bbb91644885a6db38a77ea3d697730269802/pkg/controller/deployment/sync.go#L260-L262) + case errors.HasStatusCause(err, v1.NamespaceTerminatingCause): + // If the namespace is terminating, all subsequent creates will fail and we can safely do nothing. + return nil, err case err != nil: msg := fmt.Sprintf("Failed to create new machine set %q: %v", newIS.Name, err) - if d.Spec.ProgressDeadlineSeconds != nil { - cond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionFalse, FailedISCreateReason, msg) - SetMachineDeploymentCondition(&d.Status, *cond) - // We don't really care about this error at this point, since we have a bigger issue to report. - // TODO: Identify which errors are permanent and switch DeploymentIsFailed to take into account - // these reasons as well. Related issue: https://github.com/kubernetes/kubernetes/issues/18568 - _, _ = dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{}) - } + cond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionFalse, FailedISCreateReason, msg) + SetMachineDeploymentCondition(&d.Status, *cond) + // We don't really care about this error at this point, since we have a bigger issue to report. + // TODO: Identify which errors are permanent and switch DeploymentIsFailed to take into account + // these reasons as well. 
+ Related issue: https://github.com/kubernetes/kubernetes/issues/18568 + _, _ = dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{}) dc.recorder.Eventf(d, v1.EventTypeWarning, FailedISCreateReason, msg) return nil, err } @@ -389,7 +388,7 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD } needsUpdate := SetMachineDeploymentRevision(d, newRevision) - if !alreadyExists && d.Spec.ProgressDeadlineSeconds != nil { + if !alreadyExists { msg := fmt.Sprintf("Created new machine set %q", createdIS.Name) condition := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionTrue, NewMachineSetReason, msg) SetMachineDeploymentCondition(&d.Status, *condition) diff --git a/pkg/controller/machine_safety.go b/pkg/controller/machine_safety.go index d1852d4cb..3cf98bb9b 100644 --- a/pkg/controller/machine_safety.go +++ b/pkg/controller/machine_safety.go @@ -358,6 +358,8 @@ func (c *controller) checkAndFreezeORUnfreezeMachineSets(ctx context.Context) er lowerThreshold := higherThreshold - c.safetyOptions.SafetyDown machineDeployments := c.getMachineDeploymentsForMachineSet(machineSet) + // If we have a parent machineDeployment then we use a different higherThreshold and lowerThreshold, + // keeping in mind the rolling update scenario, since we don't want to freeze during a normal rolling update. if len(machineDeployments) >= 1 { machineDeployment := machineDeployments[0] if machineDeployment != nil {