From 9ae1a7264de2d2078e8780dc46b832287243994f Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Fri, 30 Aug 2019 11:24:24 -0700 Subject: [PATCH 01/13] rollback Pravega Cluster Signed-off-by: pbelgundi --- doc/rollback-cluster.md | 54 ++++++ pkg/apis/pravega/v1alpha1/status.go | 66 ++++++- .../pravegacluster_controller.go | 27 ++- pkg/controller/pravegacluster/upgrade.go | 179 +++++++++++++++--- 4 files changed, 294 insertions(+), 32 deletions(-) create mode 100644 doc/rollback-cluster.md diff --git a/doc/rollback-cluster.md b/doc/rollback-cluster.md new file mode 100644 index 000000000..f95ddeaf6 --- /dev/null +++ b/doc/rollback-cluster.md @@ -0,0 +1,54 @@ +# Pravega cluster rollback + +This document shows how to automated rollback of Pravega cluster is implemented by the operator while preserving the cluster's state and data whenever possible. + +## Failing an Upgrade + +An Upgrade can fail because of following reasons: + +1. Incorrect configuration (wrong quota, permissions, limit ranges) +2. Network issues (ImagePullError) +3. K8s Cluster Issues. +4. Application issues (Application runtime misconfiguration or code bugs) + +An upgrade failure can manifest through a Pod to staying in `Pending` state forever or having continous restarts after moving to Running state (CrashLoopBackOff). +Here we try to fail-fast by explicitly checking for some common causes for upgrade failure like `ErrImagePull` and failing the upgrade if any pod faces this issue during upgrade. +We also have a time threshold within which deployment to a pod should complete. If it does not, then we fail the upgrade. +To indicate upgrade failure we set the folling condition on PravegaCluster status: + +``` +ClusterConditionType: Error +Status: True +Reason: UpgradeFailed +Message:
+``` + +## Rollback Trigger + +Rollback is triggered by the PravegaCluster moving to `ClusterConditionType: Error` with `Reason:UpgradeFailed` state. + +## Rollback implementation +When Rollback is started cluster moves into ClusterCondition `RollbackInProgress`. +Once Rollback completes this condition is set to false. + +A new data structure is added clusterStatus to maintain all previous cluster versions . +``` +VersionHistory []string `json:"versionHistory,omitempty"` +``` +For now, operator would support automated rollback only to the previous cluster version. Later, operator may support rollback to any supported previous version, but this would need to be invoked manually. + +Rollback involves moving each component in the cluster back to its previous cluster version. As in case of upgrade, operator would rollback one component at a time and one pod at a time. + +If Rollback completes successfully, cluster state would be set back to `PodsReady` which would mean the cluster is now in a stable state. +If Rollback Fails, cluster would move to state `RollbackError` and User would be prompted for manual intervention. + + + + + + + +## Pending tasks + + +## Prerequisites diff --git a/pkg/apis/pravega/v1alpha1/status.go b/pkg/apis/pravega/v1alpha1/status.go index 7596e7371..c39318de7 100644 --- a/pkg/apis/pravega/v1alpha1/status.go +++ b/pkg/apis/pravega/v1alpha1/status.go @@ -11,6 +11,8 @@ package v1alpha1 import ( + "fmt" + "log" "time" corev1 "k8s.io/api/core/v1" @@ -21,6 +23,7 @@ type ClusterConditionType string const ( ClusterConditionPodsReady ClusterConditionType = "PodsReady" ClusterConditionUpgrading = "Upgrading" + ClusterConditionRollback = "RollbackInProgress" ClusterConditionError = "Error" ) @@ -36,6 +39,8 @@ type ClusterStatus struct { // If the cluster is not upgrading, TargetVersion is empty. 
TargetVersion string `json:"targetVersion,omitempty"` + VersionHistory []string `json:"versionHistory,omitempty"` + // Replicas is the number of desired replicas in the cluster Replicas int32 `json:"replicas"` @@ -78,7 +83,8 @@ type ClusterCondition struct { LastTransitionTime string `json:"lastTransitionTime,omitempty"` } -func (ps *ClusterStatus) InitConditions() { +func (ps *ClusterStatus) Init() { + // Initialise conditions conditionTypes := []ClusterConditionType{ ClusterConditionPodsReady, ClusterConditionUpgrading, @@ -90,6 +96,12 @@ func (ps *ClusterStatus) InitConditions() { ps.setClusterCondition(*c) } } + + // Set current cluster version in version history, + // so if the first upgrade fails we can rollback to this version + if ps.VersionHistory == nil && ps.CurrentVersion != "" { + ps.VersionHistory = []string{ps.CurrentVersion} + } } func (ps *ClusterStatus) SetPodsReadyConditionTrue() { @@ -112,6 +124,22 @@ func (ps *ClusterStatus) SetUpgradingConditionFalse() { ps.setClusterCondition(*c) } +func (ps *ClusterStatus) SetComponent(componentName string) { + _, upgradeCondition := ps.GetClusterCondition(ClusterConditionUpgrading) + if upgradeCondition != nil && upgradeCondition.Status == corev1.ConditionTrue { + message := fmt.Sprintf("Upgrading component: %s", componentName) + c := newClusterCondition(ClusterConditionUpgrading, corev1.ConditionTrue, "", message) + ps.setClusterCondition(*c) + } + + _, rollbackCondition := ps.GetClusterCondition(ClusterConditionRollback) + if rollbackCondition != nil && rollbackCondition.Status == corev1.ConditionTrue { + message := fmt.Sprintf("Rollingback component: %s", componentName) + c := newClusterCondition(ClusterConditionRollback, corev1.ConditionTrue, "", message) + ps.setClusterCondition(*c) + } +} + func (ps *ClusterStatus) SetErrorConditionTrue(reason, message string) { c := newClusterCondition(ClusterConditionError, corev1.ConditionTrue, reason, message) ps.setClusterCondition(*c) @@ -122,6 +150,15 @@ func 
(ps *ClusterStatus) SetErrorConditionFalse() { ps.setClusterCondition(*c) } +func (ps *ClusterStatus) SetRollbackConditionTrue() { + c := newClusterCondition(ClusterConditionRollback, corev1.ConditionTrue, "", "") + ps.setClusterCondition(*c) +} +func (ps *ClusterStatus) SetRollbackConditionFalse() { + c := newClusterCondition(ClusterConditionRollback, corev1.ConditionFalse, "", "") + ps.setClusterCondition(*c) +} + func newClusterCondition(condType ClusterConditionType, status corev1.ConditionStatus, reason, message string) *ClusterCondition { return &ClusterCondition{ Type: condType, @@ -165,3 +202,30 @@ func (ps *ClusterStatus) setClusterCondition(newCondition ClusterCondition) { ps.Conditions[position] = *existingCondition } + +func (ps *ClusterStatus) AddToVersionHistory(version string) { + lastIndex := len(ps.VersionHistory) - 1 + if version != "" && ps.VersionHistory[lastIndex] != version { + ps.VersionHistory = append(ps.VersionHistory, version) + log.Printf("Updating version history adding version %v", version) + } +} + +func (ps *ClusterStatus) GetLastVersion() (previousVersion string, err error) { + if ps.VersionHistory == nil { + return "", fmt.Errorf("ERROR: No previous cluster version found") + } + len := len(ps.VersionHistory) + return ps.VersionHistory[len-1], nil +} + +func (ps *ClusterStatus) HasUpgradeFailed() bool { + _, errorCondition := ps.GetClusterCondition(ClusterConditionError) + if errorCondition == nil { + return false + } + if errorCondition.Status == corev1.ConditionTrue && errorCondition.Reason == "UpgradeFailed" { + return true + } + return false +} diff --git a/pkg/controller/pravegacluster/pravegacluster_controller.go b/pkg/controller/pravegacluster/pravegacluster_controller.go index ce2fbcfbf..b8fdde0f4 100644 --- a/pkg/controller/pravegacluster/pravegacluster_controller.go +++ b/pkg/controller/pravegacluster/pravegacluster_controller.go @@ -138,11 +138,18 @@ func (r *ReconcilePravegaCluster) run(p *pravegav1alpha1.PravegaCluster) 
(err er return fmt.Errorf("failed to sync cluster size: %v", err) } + // Upgrade err = r.syncClusterVersion(p) if err != nil { return fmt.Errorf("failed to sync cluster version: %v", err) } + // Rollback + err = r.rollbackFailedUpgrade(p) + if err != nil { + return fmt.Errorf("Rollback attempt failed: %v", err) + } + err = r.reconcileClusterStatus(p) if err != nil { return fmt.Errorf("failed to reconcile cluster status: %v", err) @@ -151,6 +158,7 @@ func (r *ReconcilePravegaCluster) run(p *pravegav1alpha1.PravegaCluster) (err er } func (r *ReconcilePravegaCluster) deployCluster(p *pravegav1alpha1.PravegaCluster) (err error) { + err = r.deployBookie(p) if err != nil { log.Printf("failed to deploy bookie: %v", err) @@ -168,10 +176,12 @@ func (r *ReconcilePravegaCluster) deployCluster(p *pravegav1alpha1.PravegaCluste log.Printf("failed to deploy segment store: %v", err) return err } + return nil } func (r *ReconcilePravegaCluster) deployController(p *pravegav1alpha1.PravegaCluster) (err error) { + pdb := pravega.MakeControllerPodDisruptionBudget(p) controllerutil.SetControllerReference(p, pdb, r.scheme) err = r.client.Create(context.TODO(), pdb) @@ -251,6 +261,7 @@ func (r *ReconcilePravegaCluster) deploySegmentStore(p *pravegav1alpha1.PravegaC } func (r *ReconcilePravegaCluster) deployBookie(p *pravegav1alpha1.PravegaCluster) (err error) { + headlessService := pravega.MakeBookieHeadlessService(p) controllerutil.SetControllerReference(p, headlessService, r.scheme) err = r.client.Create(context.TODO(), headlessService) @@ -439,7 +450,7 @@ func (r *ReconcilePravegaCluster) syncStatefulSetPvc(sts *appsv1.StatefulSet) er func (r *ReconcilePravegaCluster) reconcileClusterStatus(p *pravegav1alpha1.PravegaCluster) error { - p.Status.InitConditions() + p.Status.Init() expectedSize := util.GetClusterExpectedSize(p) listOps := &client.ListOptions{ @@ -483,3 +494,17 @@ func (r *ReconcilePravegaCluster) reconcileClusterStatus(p *pravegav1alpha1.Prav } return nil } + +func (r 
*ReconcilePravegaCluster) rollbackFailedUpgrade(p *pravegav1alpha1.PravegaCluster) error { + if p.Status.HasUpgradeFailed() { + // start rollback to previous version + previousVersion, err := p.Status.GetLastVersion() + if err != nil { + return fmt.Errorf("Error retrieving previous cluster version %v", err) + } + log.Printf("Rolling back to last cluster version %v", previousVersion) + //Rollback cluster to previous version + return r.rollbackClusterVersion(p, previousVersion) + } + return nil +} diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index 9e450b8d8..910583988 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -21,6 +21,7 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -30,6 +31,7 @@ type componentSyncVersionFun struct { fun func(p *pravegav1alpha1.PravegaCluster) (synced bool, err error) } +// upgrade func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaCluster) (err error) { defer func() { r.client.Status().Update(context.TODO(), p) @@ -48,7 +50,6 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC if upgradeCondition.Status == corev1.ConditionTrue { // Upgrade process already in progress - if p.Status.TargetVersion == "" { log.Println("syncing to an unknown version: cancelling upgrade process") return r.clearUpgradeStatus(p) @@ -59,17 +60,24 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC return r.clearUpgradeStatus(p) } - if err := r.syncComponentsVersion(p); err != nil { - log.Printf("error syncing cluster version, need manual intervention. 
%v", err) - // TODO: Trigger roll back to previous version + syncCompleted, err := r.syncComponentsVersion(p) + if err != nil { + log.Printf("error syncing cluster version, upgrade failed. %v", err) p.Status.SetErrorConditionTrue("UpgradeFailed", err.Error()) r.clearUpgradeStatus(p) + return err + } + + if syncCompleted { + // All component versions have been synced + p.Status.AddToVersionHistory(p.Status.CurrentVersion) + p.Status.CurrentVersion = p.Status.TargetVersion + log.Printf("Upgrade completed for all pravega components.") } return nil } // No upgrade in progress - if p.Spec.Version == p.Status.CurrentVersion { // No intention to upgrade return nil @@ -88,14 +96,13 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC // The upgrade process will start on the next reconciliation p.Status.TargetVersion = p.Spec.Version p.Status.SetUpgradingConditionTrue() - return nil } func (r *ReconcilePravegaCluster) clearUpgradeStatus(p *pravegav1alpha1.PravegaCluster) (err error) { p.Status.SetUpgradingConditionFalse() p.Status.TargetVersion = "" - // need to deep copy the status struct, otherwise it will be overridden + // need to deep copy the status struct, otherwise it will be overwritten // when updating the CR below status := p.Status.DeepCopy() @@ -108,9 +115,60 @@ func (r *ReconcilePravegaCluster) clearUpgradeStatus(p *pravegav1alpha1.PravegaC return nil } -func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.PravegaCluster) (err error) { - var synced bool +func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.PravegaCluster, version string) (err error) { + _, rollbackCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + if rollbackCondition == nil { + // We're in the first iteration for Rollback + // Add Rollback Condition to Cluster Status + p.Status.SetRollbackConditionTrue() + p.Spec.Version = version + p.Status.TargetVersion = p.Spec.Version + updateErr 
:= r.client.Status().Update(context.TODO(), p) + if updateErr != nil { + log.Printf("Error updating cluster: %v", updateErr.Error()) + return fmt.Errorf("Error updating cluster status. %v", updateErr) + } + } + + syncCompleted, err := r.syncComponentsVersion(p) + if err != nil { + // error rolling back, set appropriate status and ask for manual intervention + p.Status.SetErrorConditionTrue("RollbackFailed", err.Error()) + r.clearRollbackStatus(p) + log.Printf("Error rolling back to cluster version %v. Reason: %v", version, err) + return err + } + + if syncCompleted { + // All component versions have been synced + p.Status.AddToVersionHistory(p.Status.CurrentVersion) + p.Status.CurrentVersion = p.Status.TargetVersion + // Set Error/UpgradeFailed Condition to 'false', so rollback is not triggered again + p.Status.SetErrorConditionFalse() + r.clearRollbackStatus(p) + log.Printf("Rollback completed for all pravega components.") + } + return nil +} + +func (r *ReconcilePravegaCluster) clearRollbackStatus(p *pravegav1alpha1.PravegaCluster) (err error) { + log.Printf("clearRollbackStatus") + p.Status.SetRollbackConditionFalse() + p.Status.TargetVersion = "" + // need to deep copy the status struct, otherwise it will be overwritten + // when updating the CR below + status := p.Status.DeepCopy() + p.Spec.Version = p.Status.CurrentVersion + if err := r.client.Update(context.TODO(), p); err != nil { + return err + } + + p.Status = *status + return nil +} + +func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.PravegaCluster) (synced bool, err error) { for _, component := range []componentSyncVersionFun{ componentSyncVersionFun{ name: "bookkeeper", @@ -125,9 +183,9 @@ func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.Prave fun: r.syncControllerVersion, }, } { - synced, err = component.fun(p) + synced, err := component.fun(p) if err != nil { - return fmt.Errorf("failed to sync %s version. 
%s", component.name, err) + return false, fmt.Errorf("failed to sync %s version. %s", component.name, err) } if synced { @@ -135,13 +193,10 @@ func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.Prave } else { // component version sync is still in progress // Do not continue with the next component until this one is done - return nil + return false, nil } } - - // All component versions have been synced - p.Status.CurrentVersion = p.Status.TargetVersion - return nil + return true, nil } func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.PravegaCluster) (synced bool, err error) { @@ -158,6 +213,7 @@ func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.Prave } if deploy.Spec.Template.Spec.Containers[0].Image != targetImage { + p.Status.SetComponent(name) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating deployment (%s) pod template image to '%s'", deploy.Name, targetImage) @@ -178,7 +234,26 @@ func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.Prave // Check whether the upgrade is in progress or has completed if deploy.Status.UpdatedReplicas != deploy.Status.Replicas || deploy.Status.UpdatedReplicas != deploy.Status.ReadyReplicas { - // Update still in progress + // Upgrade still in progress + pods, err := r.getDeployPodsWithVersion(deploy, p.Status.TargetVersion) + if err != nil { + return false, err + } + + for _, pod := range pods { + //TODO: find out a more reliable way to determine if a pod is having issues + if pod.Status.ContainerStatuses[0].RestartCount > 1 { + return false, fmt.Errorf("pod %s is restarting", pod.Name) + } + + if !util.IsPodReady(pod) { + // At least one updated pod is still not ready + if pod.Status.ContainerStatuses[0].State.Waiting != nil && pod.Status.ContainerStatuses[0].State.Waiting.Reason == "ImagePullBackOff" { + return false, fmt.Errorf("pod %s update failed because of %s", pod.Name, 
pod.Status.ContainerStatuses[0].State.Waiting.Reason) + } + return false, nil + } + } return false, nil } @@ -201,6 +276,7 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { + p.Status.SetComponent(name) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating statefulset (%s) template image to '%s'", sts.Name, targetImage) @@ -215,19 +291,47 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra } // Pod template already updated - log.Printf("statefulset (%s) status: %d updated, %d ready, %d target", sts.Name, sts.Status.UpdatedReplicas, sts.Status.ReadyReplicas, sts.Status.Replicas) // Check whether the upgrade is in progress or has completed - if sts.Status.UpdatedReplicas != sts.Status.Replicas || - sts.Status.UpdatedReplicas != sts.Status.ReadyReplicas { - // Upgrade still in progress - return false, nil + if sts.Status.UpdatedReplicas == sts.Status.Replicas && + sts.Status.UpdatedReplicas == sts.Status.ReadyReplicas { + // StatefulSet upgrade completed + // TODO: wait until there is no under replicated ledger + // https://bookkeeper.apache.org/docs/4.7.2/reference/cli/#listunderreplicated + return true, nil } - // StatefulSet upgrade completed - return true, nil + // Upgrade still in progress + // If all replicas are ready, upgrade an old pod + + ready, err := r.checkUpdatedPods(sts, p.Status.TargetVersion) + if err != nil { + // Abort if there is any errors with the updated pods + return false, err + } + + if ready { + pod, err := r.getOneOutdatedPod(sts, p.Status.TargetVersion) + if err != nil { + return false, err + } + + if pod == nil { + return false, fmt.Errorf("could not obtain outdated pod") + } + + log.Infof("updating pod: %s", pod.Name) + + err = r.client.Delete(context.TODO(), pod) + if err != nil { + return false, err + } + } + + // wait until the next reconcile iteration + 
return false, nil } func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.PravegaCluster) (synced bool, err error) { @@ -244,6 +348,7 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { + p.Status.SetComponent(name) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating statefulset (%s) template image to '%s'", sts.Name, targetImage) @@ -257,7 +362,6 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave } // Pod template already updated - log.Printf("statefulset (%s) status: %d updated, %d ready, %d target", sts.Name, sts.Status.UpdatedReplicas, sts.Status.ReadyReplicas, sts.Status.Replicas) @@ -289,7 +393,7 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave return false, fmt.Errorf("could not obtain outdated pod") } - log.Infof("upgrading pod: %s", pod.Name) + log.Infof("updating pod: %s", pod.Name) err = r.client.Delete(context.TODO(), pod) if err != nil { @@ -302,7 +406,7 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave } func (r *ReconcilePravegaCluster) checkUpdatedPods(sts *appsv1.StatefulSet, version string) (bool, error) { - pods, err := r.getPodsWithVersion(sts, version) + pods, err := r.getStsPodsWithVersion(sts, version) if err != nil { return false, err } @@ -351,7 +455,7 @@ func (r *ReconcilePravegaCluster) getOneOutdatedPod(sts *appsv1.StatefulSet, ver return nil, nil } -func (r *ReconcilePravegaCluster) getPodsWithVersion(sts *appsv1.StatefulSet, version string) ([]*corev1.Pod, error) { +func (r *ReconcilePravegaCluster) getStsPodsWithVersion(sts *appsv1.StatefulSet, version string) ([]*corev1.Pod, error) { selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ MatchLabels: sts.Spec.Template.Labels, }) @@ -359,12 +463,27 @@ func (r *ReconcilePravegaCluster) 
getPodsWithVersion(sts *appsv1.StatefulSet, ve return nil, fmt.Errorf("failed to convert label selector: %v", err) } + return r.getPodsWithVersion(selector, sts.Namespace, version) +} + +func (r *ReconcilePravegaCluster) getDeployPodsWithVersion(deploy *appsv1.Deployment, version string) ([]*corev1.Pod, error) { + selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + MatchLabels: deploy.Spec.Template.Labels, + }) + if err != nil { + return nil, fmt.Errorf("failed to convert label selector: %v", err) + } + + return r.getPodsWithVersion(selector, deploy.Namespace, version) +} + +func (r *ReconcilePravegaCluster) getPodsWithVersion(selector labels.Selector, namespace string, version string) ([]*corev1.Pod, error) { podList := &corev1.PodList{} podlistOps := &client.ListOptions{ - Namespace: sts.Namespace, + Namespace: namespace, LabelSelector: selector, } - err = r.client.List(context.TODO(), podlistOps, podList) + err := r.client.List(context.TODO(), podlistOps, podList) if err != nil { return nil, err } From 7b1bfc2a0feca69c00c4d05cdfc611832ac33234 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Wed, 4 Sep 2019 04:41:37 -0700 Subject: [PATCH 02/13] updated documentation and other minor changes Signed-off-by: pbelgundi --- doc/rollback-cluster.md | 29 +++++++++----- pkg/apis/pravega/v1alpha1/status.go | 14 ++----- .../pravega/pravega_segmentstore.go | 2 +- pkg/controller/pravegacluster/upgrade.go | 40 +++++++++++++------ 4 files changed, 51 insertions(+), 34 deletions(-) diff --git a/doc/rollback-cluster.md b/doc/rollback-cluster.md index f95ddeaf6..90d13de87 100644 --- a/doc/rollback-cluster.md +++ b/doc/rollback-cluster.md @@ -11,10 +11,11 @@ An Upgrade can fail because of following reasons: 3. K8s Cluster Issues. 4. Application issues (Application runtime misconfiguration or code bugs) -An upgrade failure can manifest through a Pod to staying in `Pending` state forever or having continous restarts after moving to Running state (CrashLoopBackOff). 
-Here we try to fail-fast by explicitly checking for some common causes for upgrade failure like `ErrImagePull` and failing the upgrade if any pod faces this issue during upgrade. -We also have a time threshold within which deployment to a pod should complete. If it does not, then we fail the upgrade. -To indicate upgrade failure we set the folling condition on PravegaCluster status: +An upgrade failure can manifest as a Pod staying in the `Pending` state forever, or continuously restarting or crashing (CrashLoopBackOff). +A component deployment failure needs to be tracked and mapped to "Upgrade Failure" for the Pravega Cluster. +Here we try to fail fast by explicitly checking for some common causes of deployment failure, like image pull errors or the CrashLoopBackOff state, and failing the upgrade if any pod runs into this state during the upgrade. + +The following Pravega Cluster Status Condition indicates an Upgrade Failure: ``` ClusterConditionType: Error @@ -25,21 +26,29 @@ Message:
## Rollback Trigger -Rollback is triggered by the PravegaCluster moving to `ClusterConditionType: Error` with `Reason:UpgradeFailed` state. +A Rollback is triggered by the Upgrade Failure condition, i.e. the Cluster moving to +`ClusterConditionType: Error` and +`Reason:UpgradeFailed` state. -## Rollback implementation +## Rollback Implementation When Rollback is started cluster moves into ClusterCondition `RollbackInProgress`. Once Rollback completes this condition is set to false. +The order in which the components are rolled back is the following: + +1. BookKeeper +2. Pravega Segment Store +3. Pravega Controller -A new data structure is added clusterStatus to maintain all previous cluster versions . +A new field `versionHistory` has been added to Pravega ClusterStatus to maintain the history of previous cluster versions. ``` VersionHistory []string `json:"versionHistory,omitempty"` ``` -For now, operator would support automated rollback only to the previous cluster version. Later, operator may support rollback to any supported previous version, but this would need to be invoked manually. +Currently, the operator only supports automated rollback to the previous cluster version. +Later, rollback to any other previous version(s) may be supported. -Rollback involves moving each component in the cluster back to its previous cluster version. As in case of upgrade, operator would rollback one component at a time and one pod at a time. +Rollback involves moving all components in the cluster back to the previous cluster version. As in the case of an upgrade, the operator rolls back one component at a time and one pod at a time to maintain HA. -If Rollback completes successfully, cluster state would be set back to `PodsReady` which would mean the cluster is now in a stable state. +If Rollback completes successfully, the cluster state goes back to `PodsReady`, which means the cluster is now in a stable state.
If Rollback Fails, cluster would move to state `RollbackError` and User would be prompted for manual intervention. diff --git a/pkg/apis/pravega/v1alpha1/status.go b/pkg/apis/pravega/v1alpha1/status.go index c39318de7..c1ee2547d 100644 --- a/pkg/apis/pravega/v1alpha1/status.go +++ b/pkg/apis/pravega/v1alpha1/status.go @@ -124,18 +124,12 @@ func (ps *ClusterStatus) SetUpgradingConditionFalse() { ps.setClusterCondition(*c) } -func (ps *ClusterStatus) SetComponent(componentName string) { +func (ps *ClusterStatus) SetUpgradedReplicasForComponent(componentName string, updatedReplicas int32, totalReplicas int32) { _, upgradeCondition := ps.GetClusterCondition(ClusterConditionUpgrading) if upgradeCondition != nil && upgradeCondition.Status == corev1.ConditionTrue { - message := fmt.Sprintf("Upgrading component: %s", componentName) - c := newClusterCondition(ClusterConditionUpgrading, corev1.ConditionTrue, "", message) - ps.setClusterCondition(*c) - } - - _, rollbackCondition := ps.GetClusterCondition(ClusterConditionRollback) - if rollbackCondition != nil && rollbackCondition.Status == corev1.ConditionTrue { - message := fmt.Sprintf("Rollingback component: %s", componentName) - c := newClusterCondition(ClusterConditionRollback, corev1.ConditionTrue, "", message) + reason := fmt.Sprintf("Upgrading component: %s", componentName) + message := fmt.Sprintf("Upgraded Replicas: %v, Total Replicas: %v", updatedReplicas, totalReplicas) + c := newClusterCondition(ClusterConditionUpgrading, corev1.ConditionTrue, reason, message) ps.setClusterCondition(*c) } } diff --git a/pkg/controller/pravega/pravega_segmentstore.go b/pkg/controller/pravega/pravega_segmentstore.go index bb9c41458..b7fd0b41d 100644 --- a/pkg/controller/pravega/pravega_segmentstore.go +++ b/pkg/controller/pravega/pravega_segmentstore.go @@ -44,7 +44,7 @@ func MakeSegmentStoreStatefulSet(pravegaCluster *api.PravegaCluster) *appsv1.Sta Replicas: &pravegaCluster.Spec.Pravega.SegmentStoreReplicas, PodManagementPolicy: 
appsv1.OrderedReadyPodManagement, UpdateStrategy: appsv1.StatefulSetUpdateStrategy{ - Type: appsv1.RollingUpdateStatefulSetStrategyType, + Type: appsv1.OnDeleteStatefulSetStrategyType, }, Template: MakeSegmentStorePodTemplate(pravegaCluster), Selector: &metav1.LabelSelector{ diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index 910583988..b05fee615 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -213,7 +213,7 @@ func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.Prave } if deploy.Spec.Template.Spec.Containers[0].Image != targetImage { - p.Status.SetComponent(name) + p.Status.SetUpgradedReplicasForComponent(name, deploy.Status.UpdatedReplicas, deploy.Status.Replicas) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating deployment (%s) pod template image to '%s'", deploy.Name, targetImage) @@ -276,7 +276,7 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { - p.Status.SetComponent(name) + p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating statefulset (%s) template image to '%s'", sts.Name, targetImage) @@ -300,12 +300,21 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra // StatefulSet upgrade completed // TODO: wait until there is no under replicated ledger // https://bookkeeper.apache.org/docs/4.7.2/reference/cli/#listunderreplicated + p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + err = r.client.Update(context.TODO(), sts) + if err != nil { + return false, err + } return true, nil } // Upgrade still in progress // If all replicas are ready, upgrade an old pod - + 
p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + err = r.client.Update(context.TODO(), sts) + if err != nil { + return false, err + } ready, err := r.checkUpdatedPods(sts, p.Status.TargetVersion) if err != nil { // Abort if there is any errors with the updated pods @@ -348,7 +357,7 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { - p.Status.SetComponent(name) + p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating statefulset (%s) template image to '%s'", sts.Name, targetImage) @@ -371,12 +380,21 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave // StatefulSet upgrade completed // TODO: wait until there is no under replicated ledger // https://bookkeeper.apache.org/docs/4.7.2/reference/cli/#listunderreplicated + p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + err = r.client.Update(context.TODO(), sts) + if err != nil { + return false, err + } return true, nil } // Upgrade still in progress // If all replicas are ready, upgrade an old pod - + p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + err = r.client.Update(context.TODO(), sts) + if err != nil { + return false, err + } ready, err := r.checkUpdatedPods(sts, p.Status.TargetVersion) if err != nil { // Abort if there is any errors with the updated pods @@ -400,7 +418,6 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave return false, err } } - // wait until the next reconcile iteration return false, nil } @@ -412,15 +429,12 @@ func (r *ReconcilePravegaCluster) checkUpdatedPods(sts *appsv1.StatefulSet, vers } for _, pod := range pods { - //TODO: find out a more 
reliable way to determine if a pod is having issues - if pod.Status.ContainerStatuses[0].RestartCount > 1 { - return false, fmt.Errorf("pod %s is restarting", pod.Name) - } - if !util.IsPodReady(pod) { // At least one updated pod is still not ready - if pod.Status.ContainerStatuses[0].State.Waiting != nil && pod.Status.ContainerStatuses[0].State.Waiting.Reason == "ImagePullBackOff" { - return false, fmt.Errorf("pod %s update failed because of %s", pod.Name, pod.Status.ContainerStatuses[0].State.Waiting.Reason) + if pod.Status.ContainerStatuses[0].State.Waiting != nil { + if pod.Status.ContainerStatuses[0].State.Waiting.Reason == "ImagePullBackOff" || pod.Status.ContainerStatuses[0].State.Waiting.Reason == "CrashLoopBackOff" { + return false, fmt.Errorf("pod %s update failed because of %s", pod.Name, pod.Status.ContainerStatuses[0].State.Waiting.Reason) + } } return false, nil } From 8295a8bc160a2e33169f11abe6c7a19f87f04c35 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Fri, 6 Sep 2019 03:05:12 -0700 Subject: [PATCH 03/13] spec.Version is not set and rollback trigger is manual Signed-off-by: pbelgundi --- pkg/apis/pravega/v1alpha1/status.go | 27 ++++++++++++++----- .../pravegacluster_controller.go | 14 ++++++---- pkg/controller/pravegacluster/upgrade.go | 17 ++++++------ 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/pkg/apis/pravega/v1alpha1/status.go b/pkg/apis/pravega/v1alpha1/status.go index c1ee2547d..6c39bd7c6 100644 --- a/pkg/apis/pravega/v1alpha1/status.go +++ b/pkg/apis/pravega/v1alpha1/status.go @@ -205,15 +205,12 @@ func (ps *ClusterStatus) AddToVersionHistory(version string) { } } -func (ps *ClusterStatus) GetLastVersion() (previousVersion string, err error) { - if ps.VersionHistory == nil { - return "", fmt.Errorf("ERROR: No previous cluster version found") - } +func (ps *ClusterStatus) GetLastVersion() (previousVersion string) { len := len(ps.VersionHistory) - return ps.VersionHistory[len-1], nil + return ps.VersionHistory[len-1] } 
-func (ps *ClusterStatus) HasUpgradeFailed() bool { +func (ps *ClusterStatus) IsClusterInUpgradeFailedState() bool { _, errorCondition := ps.GetClusterCondition(ClusterConditionError) if errorCondition == nil { return false @@ -223,3 +220,21 @@ func (ps *ClusterStatus) HasUpgradeFailed() bool { } return false } + +func (ps *ClusterStatus) IsClusterInUpgradeFailedOrRollbackState() bool { + if ps.IsClusterInUpgradeFailedState() || ps.IsClusterInRollbackState() { + return true + } + return false +} + +func (ps *ClusterStatus) IsClusterInRollbackState() bool { + _, rollbackCondition := ps.GetClusterCondition(ClusterConditionRollback) + if rollbackCondition == nil { + return false + } + if rollbackCondition.Status == corev1.ConditionTrue { + return true + } + return false +} diff --git a/pkg/controller/pravegacluster/pravegacluster_controller.go b/pkg/controller/pravegacluster/pravegacluster_controller.go index b8fdde0f4..abffb6e3f 100644 --- a/pkg/controller/pravegacluster/pravegacluster_controller.go +++ b/pkg/controller/pravegacluster/pravegacluster_controller.go @@ -496,15 +496,19 @@ func (r *ReconcilePravegaCluster) reconcileClusterStatus(p *pravegav1alpha1.Prav } func (r *ReconcilePravegaCluster) rollbackFailedUpgrade(p *pravegav1alpha1.PravegaCluster) error { - if p.Status.HasUpgradeFailed() { + if r.isRollbackTriggered(p) { // start rollback to previous version - previousVersion, err := p.Status.GetLastVersion() - if err != nil { - return fmt.Errorf("Error retrieving previous cluster version %v", err) - } + previousVersion := p.Status.GetLastVersion() log.Printf("Rolling back to last cluster version %v", previousVersion) //Rollback cluster to previous version return r.rollbackClusterVersion(p, previousVersion) } return nil } + +func (r *ReconcilePravegaCluster) isRollbackTriggered(p *pravegav1alpha1.PravegaCluster) bool { + if p.Status.IsClusterInUpgradeFailedState() && p.Spec.Version == p.Status.GetLastVersion() { + return true + } + return false +} diff --git 
a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index b05fee615..57b9e2c86 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -37,6 +37,10 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC r.client.Status().Update(context.TODO(), p) }() + if p.Status.IsClusterInUpgradeFailedOrRollbackState() { + return nil + } + _, upgradeCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionUpgrading) _, readyCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionPodsReady) @@ -91,7 +95,6 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC // Need to sync cluster versions log.Printf("syncing cluster version from %s to %s", p.Status.CurrentVersion, p.Spec.Version) - // Setting target version and condition. // The upgrade process will start on the next reconciliation p.Status.TargetVersion = p.Spec.Version @@ -106,7 +109,6 @@ func (r *ReconcilePravegaCluster) clearUpgradeStatus(p *pravegav1alpha1.PravegaC // when updating the CR below status := p.Status.DeepCopy() - p.Spec.Version = p.Status.CurrentVersion if err := r.client.Update(context.TODO(), p); err != nil { return err } @@ -117,14 +119,15 @@ func (r *ReconcilePravegaCluster) clearUpgradeStatus(p *pravegav1alpha1.PravegaC func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.PravegaCluster, version string) (err error) { _, rollbackCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) - if rollbackCondition == nil { + if rollbackCondition == nil || rollbackCondition.Status != corev1.ConditionTrue { // We're in the first iteration for Rollback // Add Rollback Condition to Cluster Status + log.Printf("Updating Target Version to %v", version) + p.Status.TargetVersion = version p.Status.SetRollbackConditionTrue() - p.Spec.Version = version - p.Status.TargetVersion = p.Spec.Version 
updateErr := r.client.Status().Update(context.TODO(), p) if updateErr != nil { + p.Status.SetRollbackConditionFalse() log.Printf("Error updating cluster: %v", updateErr.Error()) return fmt.Errorf("Error updating cluster status. %v", updateErr) } @@ -141,12 +144,11 @@ func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.Prav if syncCompleted { // All component versions have been synced - p.Status.AddToVersionHistory(p.Status.CurrentVersion) p.Status.CurrentVersion = p.Status.TargetVersion // Set Error/UpgradeFailed Condition to 'false', so rollback is not triggered again p.Status.SetErrorConditionFalse() r.clearRollbackStatus(p) - log.Printf("Rollback completed for all pravega components.") + log.Printf("Rollback to version %v completed for all pravega components.", version) } return nil } @@ -159,7 +161,6 @@ func (r *ReconcilePravegaCluster) clearRollbackStatus(p *pravegav1alpha1.Pravega // when updating the CR below status := p.Status.DeepCopy() - p.Spec.Version = p.Status.CurrentVersion if err := r.client.Update(context.TODO(), p); err != nil { return err } From d5f72b1ae04feddf7c0dd26e9cf1ecc717501d84 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Fri, 6 Sep 2019 03:59:45 -0700 Subject: [PATCH 04/13] updated documentation Signed-off-by: pbelgundi --- doc/rollback-cluster.md | 68 ++++++++++++++++++------ pkg/apis/pravega/v1alpha1/status.go | 6 +-- pkg/controller/pravegacluster/upgrade.go | 14 ++--- 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/doc/rollback-cluster.md b/doc/rollback-cluster.md index 90d13de87..bfeeeb739 100644 --- a/doc/rollback-cluster.md +++ b/doc/rollback-cluster.md @@ -1,8 +1,9 @@ -# Pravega cluster rollback +# Pravega Cluster Rollback -This document shows how to automated rollback of Pravega cluster is implemented by the operator while preserving the cluster's state and data whenever possible. +This document details how manual rollback can be triggered after a Pravega cluster upgrade fails. 
+Note that a rollback can be triggered only on Upgrade Failure. -## Failing an Upgrade +## Upgrade Failure An Upgrade can fail because of following reasons: @@ -15,7 +16,7 @@ An upgrade failure can manifest through a Pod to staying in `Pending` state fore A component deployment failure needs to be tracked and mapped to "Upgrade Failure" for Pravega Cluster. Here we try to fail-fast by explicitly checking for some common causes for deployment failure like image pull errors or CrashLoopBackOff State and failing the upgrade if any pod runs into this state during upgrade. -The following Pravega Cluster Status Condition indicates an Upgrade Failure: +The following Pravega Cluster Status Condition indicates a Failed Upgrade: ``` ClusterConditionType: Error @@ -23,12 +24,46 @@ Status: True Reason: UpgradeFailed Message:
``` +After an Upgrade Failure output of `kubectl describe pravegacluster pravega` would look like this: -## Rollback Trigger +``` +$> kubectl describe pravegacluster pravega +. . . +Spec: +. . . +Version: 0.6.0-2252.b6f6512 +. . . +Status: +. . . +Conditions: + Last Transition Time: 2019-09-06T09:00:13Z + Last Update Time: 2019-09-06T09:00:13Z + Status: False + Type: Upgrading + Last Transition Time: 2019-09-06T08:58:40Z + Last Update Time: 2019-09-06T08:58:40Z + Status: False + Type: PodsReady + Last Transition Time: 2019-09-06T09:00:13Z + Last Update Time: 2019-09-06T09:00:13Z + Message: failed to sync segmentstore version. pod pravega-pravega-segmentstore-0 update failed because of ImagePullBackOff + Reason: UpgradeFailed + Status: True + Type: Error + . . . + Current Version: 0.6.0-2239.6e24df7 +. . . +Version History: + 0.6.0-2239.6e24df7 +``` +where `0.6.0-2252.b6f6512` is the version we tried upgrading to and `0.6.0-2239.6e24df7` is the version before upgrade. + +## Manual Rollback Trigger +A Rollback is triggered when a Pravgea Cluster is `UpgradeFailed` Error State and a user manually updates in the PravegaCluster spec the version field to point to cluster version prior to upgrade. -A Rollback is triggered by Upgrade Failure condition i.e the Cluster moving to -`ClusterConditionType: Error` and -`Reason:UpgradeFailed` state. +Note: +1. Rollback to any other cluster version (other than the previousVersion) is not supported at this point. +2. Changing the cluster spec version to the previous cluster version, when cluster is not in `UpgradeFailed` state, will trigger a rollback, but will be treated like a regular upgrade. ## Rollback Implementation When Rollback is started cluster moves into ClusterCondition `RollbackInProgress`. @@ -39,17 +74,20 @@ The order in which the components are rolled back is the following: 2. Pravega Segment Store 3. 
Pravega Controller -A new field `versionHistory` has been added to Pravega ClusterStatus to maintain history of previous cluster versions . -``` -VersionHistory []string `json:"versionHistory,omitempty"` -``` -Currently, operator only supports automated rollback to the previous cluster version. -Later, rollback to any other previous version(s), may be supported. +A new field `versionHistory` has been added to Pravega ClusterStatus to maintain history of upgrades. Rollback involves moving all components in the cluster back to the previous cluster version. As in case of upgrade, operator would rollback one component at a time and one pod at a time to maintain HA. If Rollback completes successfully, cluster state goes back to `PodsReady` which would mean the cluster is now in a stable state. -If Rollback Fails, cluster would move to state `RollbackError` and User would be prompted for manual intervention. +If Rollback Fails, the cluster would move to state `RollbackFailed` indicated by this cluster condition: +``` +ClusterConditionType: Error +Status: True +Reason: RollbackFailed +Message:
+``` + +Manual intervention would be needed for resolving this. diff --git a/pkg/apis/pravega/v1alpha1/status.go b/pkg/apis/pravega/v1alpha1/status.go index 6c39bd7c6..cae530a62 100644 --- a/pkg/apis/pravega/v1alpha1/status.go +++ b/pkg/apis/pravega/v1alpha1/status.go @@ -124,11 +124,11 @@ func (ps *ClusterStatus) SetUpgradingConditionFalse() { ps.setClusterCondition(*c) } -func (ps *ClusterStatus) SetUpgradedReplicasForComponent(componentName string, updatedReplicas int32, totalReplicas int32) { +func (ps *ClusterStatus) SetUpdatedReplicasForComponent(componentName string, updatedReplicas int32, totalReplicas int32) { _, upgradeCondition := ps.GetClusterCondition(ClusterConditionUpgrading) if upgradeCondition != nil && upgradeCondition.Status == corev1.ConditionTrue { - reason := fmt.Sprintf("Upgrading component: %s", componentName) - message := fmt.Sprintf("Upgraded Replicas: %v, Total Replicas: %v", updatedReplicas, totalReplicas) + reason := fmt.Sprintf("Updating component: %s", componentName) + message := fmt.Sprintf("Updated Replicas: %v, Total Replicas: %v", updatedReplicas, totalReplicas) c := newClusterCondition(ClusterConditionUpgrading, corev1.ConditionTrue, reason, message) ps.setClusterCondition(*c) } diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index 57b9e2c86..c814e1240 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -214,7 +214,7 @@ func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.Prave } if deploy.Spec.Template.Spec.Containers[0].Image != targetImage { - p.Status.SetUpgradedReplicasForComponent(name, deploy.Status.UpdatedReplicas, deploy.Status.Replicas) + p.Status.SetUpdatedReplicasForComponent(name, deploy.Status.UpdatedReplicas, deploy.Status.Replicas) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating deployment (%s) pod template image to '%s'", deploy.Name, 
targetImage) @@ -277,7 +277,7 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { - p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + p.Status.SetUpdatedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating statefulset (%s) template image to '%s'", sts.Name, targetImage) @@ -301,7 +301,7 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra // StatefulSet upgrade completed // TODO: wait until there is no under replicated ledger // https://bookkeeper.apache.org/docs/4.7.2/reference/cli/#listunderreplicated - p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + p.Status.SetUpdatedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) err = r.client.Update(context.TODO(), sts) if err != nil { return false, err @@ -311,7 +311,7 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra // Upgrade still in progress // If all replicas are ready, upgrade an old pod - p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + p.Status.SetUpdatedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) err = r.client.Update(context.TODO(), sts) if err != nil { return false, err @@ -358,7 +358,7 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { - p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + p.Status.SetUpdatedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating 
statefulset (%s) template image to '%s'", sts.Name, targetImage) @@ -381,7 +381,7 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave // StatefulSet upgrade completed // TODO: wait until there is no under replicated ledger // https://bookkeeper.apache.org/docs/4.7.2/reference/cli/#listunderreplicated - p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + p.Status.SetUpdatedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) err = r.client.Update(context.TODO(), sts) if err != nil { return false, err @@ -391,7 +391,7 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave // Upgrade still in progress // If all replicas are ready, upgrade an old pod - p.Status.SetUpgradedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) + p.Status.SetUpdatedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) err = r.client.Update(context.TODO(), sts) if err != nil { return false, err From d4592c5bf950aeae9c13c885f4220d4a23729583 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Mon, 9 Sep 2019 23:48:51 -0700 Subject: [PATCH 05/13] reversed order of components in rollback Signed-off-by: pbelgundi --- pkg/controller/pravegacluster/upgrade.go | 53 +++++++++++++++++------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index c814e1240..0b67622fa 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -64,7 +64,7 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC return r.clearUpgradeStatus(p) } - syncCompleted, err := r.syncComponentsVersion(p) + syncCompleted, err := r.syncComponentsVersion(p, false) if err != nil { log.Printf("error syncing cluster version, upgrade failed. 
%v", err) p.Status.SetErrorConditionTrue("UpgradeFailed", err.Error()) @@ -133,7 +133,7 @@ func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.Prav } } - syncCompleted, err := r.syncComponentsVersion(p) + syncCompleted, err := r.syncComponentsVersion(p, true) if err != nil { // error rolling back, set appropriate status and ask for manual intervention p.Status.SetErrorConditionTrue("RollbackFailed", err.Error()) @@ -169,8 +169,8 @@ func (r *ReconcilePravegaCluster) clearRollbackStatus(p *pravegav1alpha1.Pravega return nil } -func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.PravegaCluster) (synced bool, err error) { - for _, component := range []componentSyncVersionFun{ +func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.PravegaCluster, isRollback bool) (synced bool, err error) { + componentSyncFuncs := []componentSyncVersionFun{ componentSyncVersionFun{ name: "bookkeeper", fun: r.syncBookkeeperVersion, @@ -183,20 +183,43 @@ func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.Prave name: "controller", fun: r.syncControllerVersion, }, - } { - synced, err := component.fun(p) - if err != nil { - return false, fmt.Errorf("failed to sync %s version. 
%s", component.name, err) - } + } - if synced { - log.Printf("%s version sync has been completed", component.name) - } else { - // component version sync is still in progress - // Do not continue with the next component until this one is done - return false, nil + if isRollback { + startIndex := len(componentSyncFuncs) - 1 + // update components in reverse order + for i := startIndex; i >= 0; i-- { + log.Printf("Rollback: syncing component %v", i) + component := componentSyncFuncs[i] + synced, err := r.syncComponent(component, p) + if !synced { + return synced, err + } + } + } else { + for _, component := range componentSyncFuncs { + synced, err := r.syncComponent(component, p) + if !synced { + return synced, err + } } } + log.Printf("Version sync completed for all components.") + return true, nil +} + +func (r *ReconcilePravegaCluster) syncComponent(component componentSyncVersionFun, p *pravegav1alpha1.PravegaCluster) (synced bool, err error) { + isSyncComplete, err := component.fun(p) + if err != nil { + return false, fmt.Errorf("failed to sync %s version. %s", component.name, err) + } + + if !isSyncComplete { + // component version sync is still in progress + // Do not continue with the next component until this one is done + return false, nil + } + log.Printf("%s version sync has been completed", component.name) return true, nil } From af8e96de8fe65aa6973aef8d868ad796f1873923 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Mon, 9 Sep 2019 23:54:48 -0700 Subject: [PATCH 06/13] updated documentation Signed-off-by: pbelgundi --- doc/rollback-cluster.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/rollback-cluster.md b/doc/rollback-cluster.md index bfeeeb739..ed3454aa8 100644 --- a/doc/rollback-cluster.md +++ b/doc/rollback-cluster.md @@ -68,11 +68,11 @@ Note: ## Rollback Implementation When Rollback is started cluster moves into ClusterCondition `RollbackInProgress`. Once Rollback completes this condition is set to false. 
-The order in which the components are rolled back is the following: +The order in which the components are rolled back the reverse as upgrade : -1. BookKeeper +1. Pravega Controller 2. Pravega Segment Store -3. Pravega Controller +3. BookKeeper A new field `versionHistory` has been added to Pravega ClusterStatus to maintain history of upgrades. From 2d79b88fa89e704d36e89e1d55457fab81181778 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Tue, 17 Sep 2019 07:01:36 -0700 Subject: [PATCH 07/13] unit tests for rollback Signed-off-by: pbelgundi --- pkg/controller/pravegacluster/upgrade.go | 2 + pkg/controller/pravegacluster/upgrade_test.go | 134 +++++++++++++++++- 2 files changed, 131 insertions(+), 5 deletions(-) diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index 08aa285b2..02a808ba1 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -41,6 +41,7 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC // we cannot upgrade if cluster is in UpgradeFailed or Rollback state if p.Status.IsClusterInUpgradeFailedOrRollbackState() { + log.Println("Can't upgrade a Cluster in Upgrade Failed State. Please rollback first.") return nil } @@ -135,6 +136,7 @@ func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.Prav log.Printf("Error updating cluster: %v", updateErr.Error()) return fmt.Errorf("Error updating cluster status. 
%v", updateErr) } + return nil } syncCompleted, err := r.syncComponentsVersion(p) diff --git a/pkg/controller/pravegacluster/upgrade_test.go b/pkg/controller/pravegacluster/upgrade_test.go index 8e7d4a098..c145a77ba 100644 --- a/pkg/controller/pravegacluster/upgrade_test.go +++ b/pkg/controller/pravegacluster/upgrade_test.go @@ -12,6 +12,7 @@ package pravegacluster import ( "context" + "fmt" "testing" "github.com/pravega/pravega-operator/pkg/apis/pravega/v1alpha1" @@ -36,7 +37,7 @@ func TestUpgrade(t *testing.T) { RunSpecs(t, "Pravega cluster") } -var _ = Describe("Pravega Cluster", func() { +var _ = Describe("Pravega Cluster Version Sync", func() { const ( Name = "example" Namespace = "default" @@ -47,7 +48,7 @@ var _ = Describe("Pravega Cluster", func() { r *ReconcilePravegaCluster ) - Context("Upgrade", func() { + var _ = Describe("Upgrade Test", func() { var ( req reconcile.Request p *v1alpha1.PravegaCluster @@ -70,7 +71,7 @@ var _ = Describe("Pravega Cluster", func() { s.AddKnownTypes(v1alpha1.SchemeGroupVersion, p) }) - Context("Pravega condition", func() { + Context("Cluster condition prior to Upgrade", func() { var ( client client.Client err error @@ -131,7 +132,7 @@ var _ = Describe("Pravega Cluster", func() { _, _ = r.Reconcile(req) }) - Context("Condition", func() { + Context("Upgrading Condition", func() { var ( foundPravega *v1alpha1.PravegaCluster ) @@ -158,7 +159,7 @@ var _ = Describe("Pravega Cluster", func() { BeforeEach(func() { sts = &appsv1.StatefulSet{} name := util.StatefulSetNameForBookie(p.Name) - _ = r.client.Get(context.TODO(), types.NamespacedName{Name: name, Namespace: p.Namespace}, sts) + //_ = r.client.Get(context.TODO(), types.NamespacedName{Name: name, Namespace: p.Namespace}, sts) sts.Status.ReadyReplicas = 1 r.client.Update(context.TODO(), sts) @@ -243,4 +244,127 @@ var _ = Describe("Pravega Cluster", func() { }) }) }) + + var _ = Describe("Rollback Test", func() { + var ( + req reconcile.Request + p *v1alpha1.PravegaCluster + ) + 
+ BeforeEach(func() { + req = reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: Name, + Namespace: Namespace, + }, + } + p = &v1alpha1.PravegaCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: Name, + Namespace: Namespace, + }, + } + p.Spec.Version = "0.5.0" + s.AddKnownTypes(v1alpha1.SchemeGroupVersion, p) + }) + + Context("Cluster Condition before Rollback", func() { + var ( + client client.Client + err error + ) + + BeforeEach(func() { + client = fake.NewFakeClient(p) + r = &ReconcilePravegaCluster{client: client, scheme: s} + _, err = r.Reconcile(req) + }) + + Context("First reconcile", func() { + It("shouldn't error", func() { + Ω(err).Should(BeNil()) + }) + }) + + Context("Initial status", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, err = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should have current version set to spec version", func() { + Ω(foundPravega.Status.CurrentVersion).Should(Equal(foundPravega.Spec.Version)) + }) + + It("should not have rollback condition set", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(v1alpha1.ClusterConditionRollback) + Ω(rollbackCondition).Should(BeNil()) + }) + + It("should have version history set", func() { + history := foundPravega.Status.VersionHistory + Ω(history[0]).Should(Equal("0.5.0")) + }) + + }) + }) + + Context("Rollback to previous version", func() { + var ( + client client.Client + ) + + BeforeEach(func() { + + p.Spec = v1alpha1.ClusterSpec{ + Version: "0.6.0", + } + p.WithDefaults() + client = fake.NewFakeClient(p) + r = &ReconcilePravegaCluster{client: client, scheme: s} + _, _ = r.Reconcile(req) + foundPravega := &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + foundPravega.Spec.Version = "0.5.0" + foundPravega.Status.VersionHistory = []string{"0.5.0"} + // bypass the pods ready 
check in the upgrade logic + foundPravega.Status.SetPodsReadyConditionFalse() + foundPravega.Status.SetErrorConditionTrue("UpgradeFailed", "some error") + client.Update(context.TODO(), foundPravega) + _, _ = r.Reconcile(req) + + }) + + Context("Rollback Triggered", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + /* + It("should set Rollback condition status to be true", func() { + _, condition := foundPravega.Status.GetClusterCondition(v1alpha1.ClusterConditionRollback) + fmt.Println(condition) + Ω(condition.Status).To(Equal(corev1.ConditionTrue)) + }) + */ + It("should set target version to previous version", func() { + fmt.Printf("AFTER Cluster Version:%v", foundPravega.Status.TargetVersion) + fmt.Println() + _, errCondition := foundPravega.Status.GetClusterCondition(v1alpha1.ClusterConditionError) + + fmt.Printf("Error condition :%v", errCondition) + fmt.Println() + Ω(foundPravega.Status.TargetVersion).To(Equal(foundPravega.Spec.Version)) + }) + }) + }) + }) }) From 6b7a00a7639f4882dd94dc5dab4e46bd66cffb63 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Tue, 17 Sep 2019 08:49:15 -0700 Subject: [PATCH 08/13] doc changes Signed-off-by: pbelgundi --- doc/rollback-cluster.md | 11 ------- doc/upgrade-cluster.md | 30 ++++++++++++++++--- pkg/controller/pravegacluster/upgrade.go | 1 - pkg/controller/pravegacluster/upgrade_test.go | 20 ++++--------- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/doc/rollback-cluster.md b/doc/rollback-cluster.md index ed3454aa8..e27d51a36 100644 --- a/doc/rollback-cluster.md +++ b/doc/rollback-cluster.md @@ -88,14 +88,3 @@ Message:
``` Manual intervention would be needed for resolving this. - - - - - - - -## Pending tasks - - -## Prerequisites diff --git a/doc/upgrade-cluster.md b/doc/upgrade-cluster.md index 8e98f2692..ef956cecb 100644 --- a/doc/upgrade-cluster.md +++ b/doc/upgrade-cluster.md @@ -103,8 +103,7 @@ Segment Store instances need access to a persistent volume to store the cache. L Also, Segment Store pods need to be individually accessed by clients, so having a stable network identifier provided by the Statefulset and a headless service is very convenient. -Same as Bookkeeper, we use `OnDelete` strategy for Segment Store. The reason that we don't use `RollingUpdate` strategy here is that we found it convenient to manage the upgrade -and rollback in the same fashion. Using `RollingUpdate` will introduce Kubernetes rollback mechanism which will cause trouble to our implementation. +Same as Bookkeeper, we use `OnDelete` strategy for Segment Store. The reason that we don't use `RollingUpdate` strategy here is that we found it convenient to manage the upgrade and rollback in the same fashion. Using `RollingUpdate` will introduce Kubernetes rollback mechanism which will cause trouble to our implementation. ### Pravega Controller upgrade @@ -131,6 +130,29 @@ NAME VERSION DESIRED MEMBERS READY MEMBERS AGE example 0.5.0 8 8 1h ``` +To see progress of Upgrade, you can do a `kubectl describe` +``` +$ kubectl describe PravegaCluster example +... +Status: + Conditions: + Status: True + Type: Upgrading + Reason: Updating BookKeeper + Message: 1 + Last Transition Time: 2019-04-01T19:42:37+02:00 + Last Update Time: 2019-04-01T19:42:37+02:00 + Status: False + Type: PodsReady + Last Transition Time: 2019-04-01T19:43:08+02:00 + Last Update Time: 2019-04-01T19:43:08+02:00 + Status: False + Type: Error +... + +``` +The `Reason` field in Upgrading Condition shows the component currently being upgraded and `Message` field reflects number of successfully upgraded replicas in this component. 
+ If your upgrade has failed, you can describe the status section of your Pravega cluster to discover why. ``` @@ -181,10 +203,10 @@ INFO[5899] Reconciling PravegaCluster default/example INFO[5900] statefulset (example-bookie) status: 1 updated, 2 ready, 3 target INFO[5929] Reconciling PravegaCluster default/example INFO[5930] statefulset (example-bookie) status: 1 updated, 2 ready, 3 target -INFO[5930] error syncing cluster version, need manual intervention. failed to sync bookkeeper version. pod example-bookie-0 is restarting +INFO[5930] error syncing cluster version, upgrade failed. failed to sync bookkeeper version. pod example-bookie-0 is restarting ... ``` ### Recovering from a failed upgrade -Not defined yet. Check [this issue](https://github.com/pravega/pravega-operator/issues/157) for tracking. +See [Rollback](rollback-cluster.md) diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index 02a808ba1..f96ab4293 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -41,7 +41,6 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC // we cannot upgrade if cluster is in UpgradeFailed or Rollback state if p.Status.IsClusterInUpgradeFailedOrRollbackState() { - log.Println("Can't upgrade a Cluster in Upgrade Failed State. 
Please rollback first.") return nil } diff --git a/pkg/controller/pravegacluster/upgrade_test.go b/pkg/controller/pravegacluster/upgrade_test.go index c145a77ba..ced98c130 100644 --- a/pkg/controller/pravegacluster/upgrade_test.go +++ b/pkg/controller/pravegacluster/upgrade_test.go @@ -12,7 +12,6 @@ package pravegacluster import ( "context" - "fmt" "testing" "github.com/pravega/pravega-operator/pkg/apis/pravega/v1alpha1" @@ -348,20 +347,13 @@ var _ = Describe("Pravega Cluster Version Sync", func() { foundPravega = &v1alpha1.PravegaCluster{} _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) }) - /* - It("should set Rollback condition status to be true", func() { - _, condition := foundPravega.Status.GetClusterCondition(v1alpha1.ClusterConditionRollback) - fmt.Println(condition) - Ω(condition.Status).To(Equal(corev1.ConditionTrue)) - }) - */ - It("should set target version to previous version", func() { - fmt.Printf("AFTER Cluster Version:%v", foundPravega.Status.TargetVersion) - fmt.Println() - _, errCondition := foundPravega.Status.GetClusterCondition(v1alpha1.ClusterConditionError) - fmt.Printf("Error condition :%v", errCondition) - fmt.Println() + It("should set Rollback condition status to be true", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(v1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Status).To(Equal(corev1.ConditionTrue)) + }) + + It("should set target version to previous version", func() { Ω(foundPravega.Status.TargetVersion).To(Equal(foundPravega.Spec.Version)) }) }) From 4ad69d17cb9aa6039b1ba2454a6fc02e680f4482 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Tue, 17 Sep 2019 09:45:15 -0700 Subject: [PATCH 09/13] review comments Signed-off-by: pbelgundi --- doc/rollback-cluster.md | 25 ++++++++++--------- pkg/apis/pravega/v1alpha1/status.go | 10 -------- .../pravegacluster_controller.go | 1 - pkg/controller/pravegacluster/upgrade.go | 3 +-- 4 files changed, 14 insertions(+), 25 deletions(-) diff 
--git a/doc/rollback-cluster.md b/doc/rollback-cluster.md index e27d51a36..8e686205f 100644 --- a/doc/rollback-cluster.md +++ b/doc/rollback-cluster.md @@ -12,7 +12,7 @@ An Upgrade can fail because of following reasons: 3. K8s Cluster Issues. 4. Application issues (Application runtime misconfiguration or code bugs) -An upgrade failure can manifest through a Pod to staying in `Pending` state forever or continuously restarting or crashing (CrashLoopBackOff). +An upgrade failure can manifest through a Pod staying in `Pending` state forever or continuously restarting or crashing (CrashLoopBackOff). A component deployment failure needs to be tracked and mapped to "Upgrade Failure" for Pravega Cluster. Here we try to fail-fast by explicitly checking for some common causes for deployment failure like image pull errors or CrashLoopBackOff State and failing the upgrade if any pod runs into this state during upgrade. @@ -24,7 +24,7 @@ Status: True Reason: UpgradeFailed Message:
``` -After an Upgrade Failure output of `kubectl describe pravegacluster pravega` would look like this: +After an Upgrade Failure the output of `kubectl describe pravegacluster pravega` would look like this: ``` $> kubectl describe pravegacluster pravega @@ -59,27 +59,28 @@ Version History: where `0.6.0-2252.b6f6512` is the version we tried upgrading to and `0.6.0-2239.6e24df7` is the version before upgrade. ## Manual Rollback Trigger -A Rollback is triggered when a Pravgea Cluster is `UpgradeFailed` Error State and a user manually updates in the PravegaCluster spec the version field to point to cluster version prior to upgrade. +A Rollback is triggered when a Pravega Cluster is in `UpgradeFailed` Error State and a user manually updates version feild in the PravegaCluster spec to point to the last stable cluster version. Note: -1. Rollback to any other cluster version (other than the previousVersion) is not supported at this point. -2. Changing the cluster spec version to the previous cluster version, when cluster is not in `UpgradeFailed` state, will trigger a rollback, but will be treated like a regular upgrade. +1. Rollback to only the last stable cluster version is supported at this point. +2. Changing the cluster spec version to the previous cluster version, when cluster is not in `UpgradeFailed` state, will not trigger a rollback. ## Rollback Implementation -When Rollback is started cluster moves into ClusterCondition `RollbackInProgress`. -Once Rollback completes this condition is set to false. -The order in which the components are rolled back the reverse as upgrade : +When Rollback is started the cluster moves into ClusterCondition `RollbackInProgress`. +Once the Rollback completes, this condition is set to false. + +The operator rolls back components following the reverse upgrade order : 1. Pravega Controller 2. Pravega Segment Store 3. BookKeeper -A new field `versionHistory` has been added to Pravega ClusterStatus to maintain history of upgrades. 
+A new field `versionHistory` has been added to Pravega ClusterStatus to maintain the history of upgrades. -Rollback involves moving all components in the cluster back to the previous cluster version. As in case of upgrade, operator would rollback one component at a time and one pod at a time to maintain HA. +Rollback involves moving all components in the cluster back to the last stable cluster version. As with upgrades, the operator rolls back one component at a time and one pod at a time to preserve high-availability. -If Rollback completes successfully, cluster state goes back to `PodsReady` which would mean the cluster is now in a stable state. -If Rollback Fails, the cluster would move to state `RollbackFailed` indicated by this cluster condition: +If the Rollback completes successfully, the cluster state goes back to `PodsReady`, which would mean the cluster is now in a stable state. +If the Rollback Fails, the cluster would move to state `RollbackFailed` indicated by this cluster condition: ``` ClusterConditionType: Error Status: True diff --git a/pkg/apis/pravega/v1alpha1/status.go b/pkg/apis/pravega/v1alpha1/status.go index 202a6b56d..309d9c399 100644 --- a/pkg/apis/pravega/v1alpha1/status.go +++ b/pkg/apis/pravega/v1alpha1/status.go @@ -128,16 +128,6 @@ func (ps *ClusterStatus) SetUpgradingConditionFalse() { ps.setClusterCondition(*c) } -/* -func (ps *ClusterStatus) SetUpdatedReplicasForComponent(componentName string, updatedReplicas int32, totalReplicas int32) { - _, upgradeCondition := ps.GetClusterCondition(ClusterConditionUpgrading) - if upgradeCondition != nil && upgradeCondition.Status == corev1.ConditionTrue { - reason := fmt.Sprintf("Updating component: %s. 
Updated Replicas: %v, Total Replicas: %v", componentName, updatedReplicas, totalReplicas) - c := newClusterCondition(ClusterConditionUpgrading, corev1.ConditionTrue, reason, message) - ps.setClusterCondition(*c) - } -} -*/ func (ps *ClusterStatus) SetErrorConditionTrue(reason, message string) { c := newClusterCondition(ClusterConditionError, corev1.ConditionTrue, reason, message) ps.setClusterCondition(*c) diff --git a/pkg/controller/pravegacluster/pravegacluster_controller.go b/pkg/controller/pravegacluster/pravegacluster_controller.go index abffb6e3f..68143149b 100644 --- a/pkg/controller/pravegacluster/pravegacluster_controller.go +++ b/pkg/controller/pravegacluster/pravegacluster_controller.go @@ -158,7 +158,6 @@ func (r *ReconcilePravegaCluster) run(p *pravegav1alpha1.PravegaCluster) (err er } func (r *ReconcilePravegaCluster) deployCluster(p *pravegav1alpha1.PravegaCluster) (err error) { - err = r.deployBookie(p) if err != nil { log.Printf("failed to deploy bookie: %v", err) diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index f96ab4293..bf17f17d8 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -140,7 +140,7 @@ func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.Prav syncCompleted, err := r.syncComponentsVersion(p) if err != nil { - // error rolling back, set appropriate status and ask for manual intervention + // Error rolling back, set appropriate status and ask for manual intervention p.Status.SetErrorConditionTrue("RollbackFailed", err.Error()) r.clearRollbackStatus(p) log.Printf("Error rolling back to cluster version %v. 
Reason: %v", version, err) @@ -397,7 +397,6 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { - //p.Status.SetUpdatedReplicasForComponent(name, sts.Status.UpdatedReplicas, sts.Status.Replicas) p.Status.UpdateProgress(pravegav1alpha1.UpdatingBookkeeperReason, "0") // Need to update pod template // This will trigger the rolling upgrade process From 84dc4cd1e3494a236210ff4f85ff00cf22ba1d01 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Wed, 18 Sep 2019 05:57:33 -0700 Subject: [PATCH 10/13] added more unit tests Signed-off-by: pbelgundi --- pkg/apis/pravega/v1alpha1/status.go | 2 +- pkg/controller/pravegacluster/upgrade.go | 1 + pkg/controller/pravegacluster/upgrade_test.go | 87 ++++++++++++++++++- 3 files changed, 85 insertions(+), 5 deletions(-) diff --git a/pkg/apis/pravega/v1alpha1/status.go b/pkg/apis/pravega/v1alpha1/status.go index 309d9c399..fe9d2603d 100644 --- a/pkg/apis/pravega/v1alpha1/status.go +++ b/pkg/apis/pravega/v1alpha1/status.go @@ -139,7 +139,7 @@ func (ps *ClusterStatus) SetErrorConditionFalse() { } func (ps *ClusterStatus) SetRollbackConditionTrue(reason, message string) { - c := newClusterCondition(ClusterConditionRollback, corev1.ConditionTrue, "", "") + c := newClusterCondition(ClusterConditionRollback, corev1.ConditionTrue, reason, message) ps.setClusterCondition(*c) } func (ps *ClusterStatus) SetRollbackConditionFalse() { diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index bf17f17d8..dc9c84d01 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -244,6 +244,7 @@ func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.Prave if deploy.Spec.Template.Spec.Containers[0].Image != targetImage { p.Status.UpdateProgress(pravegav1alpha1.UpdatingControllerReason, "0") + // Need to update pod template // This will trigger the rolling 
upgrade process log.Printf("updating deployment (%s) pod template image to '%s'", deploy.Name, targetImage) diff --git a/pkg/controller/pravegacluster/upgrade_test.go b/pkg/controller/pravegacluster/upgrade_test.go index ced98c130..c461d062d 100644 --- a/pkg/controller/pravegacluster/upgrade_test.go +++ b/pkg/controller/pravegacluster/upgrade_test.go @@ -158,7 +158,7 @@ var _ = Describe("Pravega Cluster Version Sync", func() { BeforeEach(func() { sts = &appsv1.StatefulSet{} name := util.StatefulSetNameForBookie(p.Name) - //_ = r.client.Get(context.TODO(), types.NamespacedName{Name: name, Namespace: p.Namespace}, sts) + _ = r.client.Get(context.TODO(), types.NamespacedName{Name: name, Namespace: p.Namespace}, sts) sts.Status.ReadyReplicas = 1 r.client.Update(context.TODO(), sts) @@ -318,7 +318,6 @@ var _ = Describe("Pravega Cluster Version Sync", func() { ) BeforeEach(func() { - p.Spec = v1alpha1.ClusterSpec{ Version: "0.6.0", } @@ -335,7 +334,6 @@ var _ = Describe("Pravega Cluster Version Sync", func() { foundPravega.Status.SetErrorConditionTrue("UpgradeFailed", "some error") client.Update(context.TODO(), foundPravega) _, _ = r.Reconcile(req) - }) Context("Rollback Triggered", func() { @@ -343,7 +341,6 @@ var _ = Describe("Pravega Cluster Version Sync", func() { foundPravega *v1alpha1.PravegaCluster ) BeforeEach(func() { - foundPravega = &v1alpha1.PravegaCluster{} _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) }) @@ -357,6 +354,88 @@ var _ = Describe("Pravega Cluster Version Sync", func() { Ω(foundPravega.Status.TargetVersion).To(Equal(foundPravega.Spec.Version)) }) }) + + Context("Rollback Controller", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, _ = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should set rollback condition reason to UpdatingController and message to 0", func() { + _, rollbackCondition := 
foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingControllerReason)) + Ω(rollbackCondition.Message).Should(Equal("0")) + }) + }) + + Context("Rollback SegmentStore", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should set rollback condition reason to UpdatingSegmentStore and message to 0", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingSegmentstoreReason)) + Ω(rollbackCondition.Message).Should(Equal("0")) + }) + }) + + Context("Rollback Bookkeeper", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should set rollback condition reason to UpdatingBookkeeper and message to 0", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingBookkeeperReason)) + Ω(rollbackCondition.Message).Should(Equal("0")) + }) + }) + Context("Rollback Completed", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should set currentversion equal to target version", func() { + Ω(foundPravega.Status.CurrentVersion).Should(Equal("0.5.0")) + 
}) + It("should set TargetVersoin to empty", func() { + Ω(foundPravega.Status.TargetVersion).Should(Equal("")) + }) + It("should set rollback condition to false", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Status).To(Equal(corev1.ConditionFalse)) + }) + It("should set error condition to false", func() { + _, errorCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionError) + Ω(errorCondition.Status).To(Equal(corev1.ConditionFalse)) + }) + }) }) }) }) From efa28023df56c6ad428adb2e8d742128584ed17d Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Thu, 19 Sep 2019 05:24:39 -0700 Subject: [PATCH 11/13] updated documentation Signed-off-by: pbelgundi --- doc/rollback-cluster.md | 5 ++++- doc/upgrade-cluster.md | 15 +++++++++++++-- pkg/apis/pravega/v1alpha1/status.go | 11 +++++++++++ pkg/controller/pravegacluster/upgrade.go | 11 +++++++---- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/doc/rollback-cluster.md b/doc/rollback-cluster.md index 8e686205f..b1e85a0f3 100644 --- a/doc/rollback-cluster.md +++ b/doc/rollback-cluster.md @@ -88,4 +88,7 @@ Reason: RollbackFailed Message:
``` -Manual intervention would be needed for resolving this. +When a rollback failure happens, manual intervention would be required to solve this. +After checking and resolving the root cause of failure, a user can upgrade to : +1. The version to which a user initially intended to upgrade.(which caused upgrade failure) +2. To any other supported version based versions of all pods in the cluster. diff --git a/doc/upgrade-cluster.md b/doc/upgrade-cluster.md index ef956cecb..bd3ea4154 100644 --- a/doc/upgrade-cluster.md +++ b/doc/upgrade-cluster.md @@ -20,8 +20,6 @@ Check out [Pravega documentation](http://pravega.io/docs/latest/) for more infor ## Pending tasks -- The rollback mechanism is on the roadmap but not implemented yet. Check out [this issue](https://github.com/pravega/pravega-operator/issues/153). -- Manual recovery from an upgrade is possible but it has not been defined yet. Check out [this issue](https://github.com/pravega/pravega-operator/issues/157). - There is no validation of the configured desired version. Check out [this issue](https://github.com/pravega/pravega-operator/issues/156) @@ -35,6 +33,19 @@ NAME VERSION DESIRED MEMBERS READY MEMBERS AGE example 0.4.0 7 7 11m ``` +## Upgrade Supported Versions Matrix + +| BASE VERSION | TARGET VERSION | +| ------------ | ---------------- | +| 0.1.0 | 0.1.0 | +| 0.2.0 | 0.2.0 | +| 0.3.0 | 0.3.0, 0.3.1, 0.3.2| +| 0.3.1 | 0.3.1, 0.3.2 | +| 0.3.2 | 0.3.2 | +| 0.4.0 | 0.4.0 | +| 0.5.0 | 0.5.0, 0.6.0 | +| 0.6.0 | 0.6.0 | + ## Trigger an upgrade To initiate an upgrade process, a user has to update the `spec.version` field on the `PravegaCluster` custom resource. This can be done in three different ways using the `kubectl` command. 
diff --git a/pkg/apis/pravega/v1alpha1/status.go b/pkg/apis/pravega/v1alpha1/status.go index fe9d2603d..572145cd3 100644 --- a/pkg/apis/pravega/v1alpha1/status.go +++ b/pkg/apis/pravega/v1alpha1/status.go @@ -244,6 +244,17 @@ func (ps *ClusterStatus) IsClusterInUpgradingState() bool { return false } +func (ps *ClusterStatus) IsClusterInRollbackFailedState() bool { + _, errorCondition := ps.GetClusterCondition(ClusterConditionError) + if errorCondition == nil { + return false + } + if errorCondition.Status == corev1.ConditionTrue && errorCondition.Reason == "RollbackFailed" { + return true + } + return false +} + func (ps *ClusterStatus) UpdateProgress(reason, updatedReplicas string) { if ps.IsClusterInUpgradingState() { // Set the upgrade condition reason to be UpgradingBookkeeperReason, message to be 0 diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index dc9c84d01..c8445fbdd 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -90,10 +90,13 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC return nil } - if readyCondition == nil || readyCondition.Status != corev1.ConditionTrue { - r.clearUpgradeStatus(p) - log.Print("cannot trigger upgrade if there are unready pods") - return nil + if !p.Status.IsClusterInRollbackFailedState() { + // skip this check when cluster is in RollbackFailed state + if readyCondition == nil || readyCondition.Status != corev1.ConditionTrue { + r.clearUpgradeStatus(p) + log.Print("cannot trigger upgrade if there are unready pods") + return nil + } } // Need to sync cluster versions From f7b770fef735735154b4f432d03ddbc4662cb261 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Thu, 19 Sep 2019 07:58:20 -0700 Subject: [PATCH 12/13] fix Signed-off-by: pbelgundi --- pkg/controller/pravegacluster/upgrade.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git 
a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index c8445fbdd..68c496632 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -97,6 +97,9 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC log.Print("cannot trigger upgrade if there are unready pods") return nil } + } else { + // We are upgrading after a rollback failure, reset Error Status + p.Status.SetErrorConditionFalse() } // Need to sync cluster versions @@ -125,6 +128,9 @@ func (r *ReconcilePravegaCluster) clearUpgradeStatus(p *pravegav1alpha1.PravegaC } func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.PravegaCluster, version string) (err error) { + defer func() { + r.client.Status().Update(context.TODO(), p) + }() _, rollbackCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) if rollbackCondition == nil || rollbackCondition.Status != corev1.ConditionTrue { // We're in the first iteration for Rollback @@ -147,6 +153,7 @@ func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.Prav p.Status.SetErrorConditionTrue("RollbackFailed", err.Error()) r.clearRollbackStatus(p) log.Printf("Error rolling back to cluster version %v. 
Reason: %v", version, err) + //r.client.Status().Update(context.TODO(), p) return err } @@ -158,7 +165,7 @@ func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.Prav r.clearRollbackStatus(p) log.Printf("Rollback to version %v completed for all pravega components.", version) } - r.client.Status().Update(context.TODO(), p) + //r.client.Status().Update(context.TODO(), p) return nil } From 33adccc8de1628742548b1700f03c5cce9f6cf81 Mon Sep 17 00:00:00 2001 From: pbelgundi Date: Fri, 20 Sep 2019 04:37:36 -0700 Subject: [PATCH 13/13] doc changes Signed-off-by: pbelgundi --- doc/rollback-cluster.md | 110 ++++++++++++++++++++--- doc/upgrade-cluster.md | 6 +- pkg/controller/pravegacluster/upgrade.go | 2 +- 3 files changed, 104 insertions(+), 14 deletions(-) diff --git a/doc/rollback-cluster.md b/doc/rollback-cluster.md index b1e85a0f3..d499643c8 100644 --- a/doc/rollback-cluster.md +++ b/doc/rollback-cluster.md @@ -56,31 +56,97 @@ Conditions: Version History: 0.6.0-2239.6e24df7 ``` -where `0.6.0-2252.b6f6512` is the version we tried upgrading to and `0.6.0-2239.6e24df7` is the version before upgrade. +where `0.6.0-2252.b6f6512` is the version we tried upgrading to and `0.6.0-2239.6e24df7` is the cluster version prior to triggering the upgrade. ## Manual Rollback Trigger + A Rollback is triggered when a Pravega Cluster is in `UpgradeFailed` Error State and a user manually updates version feild in the PravegaCluster spec to point to the last stable cluster version. +A Rollback involves moving all components in the cluster back to the last stable cluster version. As with upgrades, the operator rolls back one component at a time and one pod at a time to preserve high-availability. + Note: -1. Rollback to only the last stable cluster version is supported at this point. +1. A Rollback to only the last stable cluster version is supported at this point. 2. 
Changing the cluster spec version to the previous cluster version, when cluster is not in `UpgradeFailed` state, will not trigger a rollback. ## Rollback Implementation -When Rollback is started the cluster moves into ClusterCondition `RollbackInProgress`. + +When Rollback is triggered the cluster moves into ClusterCondition `RollbackInProgress`. Once the Rollback completes, this condition is set to false. +During a Rollback, the Cluster Status should look something like: +``` +$> kubectl describe pravegacluster pravega +. . . +Status: + Conditions: + Last Transition Time: 2019-09-20T10:41:10Z + Last Update Time: 2019-09-20T10:41:10Z + Status: False + Type: Upgrading + Last Transition Time: 2019-09-20T10:45:12Z + Last Update Time: 2019-09-20T10:45:12Z + Status: True + Type: PodsReady + Last Transition Time: 2019-09-20T10:41:10Z + Last Update Time: 2019-09-20T10:41:10Z + Message: failed to sync segmentstore version. pod pravega-pravega-segmentstore-0 update failed because of ImagePullBackOff + Reason: UpgradeFailed + Status: True + Type: Error + Last Update Time: 2019-09-20T10:45:12Z + Message: 1 + Reason: Updating Bookkeeper + Status: True + Type: RollbackInProgress +. . . +``` +Here the `RollbackInProgress` condition being `true` indicates that a Rollback is in Progress. +Also `Reason` and `Message` fields of this condition indicate the component being rolled back and the number of updated replicas respectively. + The operator rolls back components following the reverse upgrade order : 1. Pravega Controller 2. Pravega Segment Store 3. BookKeeper -A new field `versionHistory` has been added to Pravega ClusterStatus to maintain the history of upgrades. +A `versionHistory` field in the PravegaCluster status maintains the history of upgrades. + +## Rollback Outcome -Rollback involves moving all components in the cluster back to the last stable cluster version. 
As with upgrades, the operator rolls back one component at a time and one pod at a time to preserve high-availability. +### Success +If the Rollback completes successfully, the cluster state goes back to condition `PodsReady`, which would mean the cluster is now in a stable state. All other conditions should be `false`. +``` +Last Transition Time: 2019-09-20T09:49:26Z +Last Update Time: 2019-09-20T09:49:26Z +Status: True +Type: PodsReady + +``` -If the Rollback completes successfully, the cluster state goes back to `PodsReady`, which would mean the cluster is now in a stable state. -If the Rollback Fails, the cluster would move to state `RollbackFailed` indicated by this cluster condition: +Example: +``` +Status: + Conditions: + Last Transition Time: 2019-09-20T10:12:04Z + Last Update Time: 2019-09-20T10:12:04Z + Status: False + Type: Upgrading + Last Transition Time: 2019-09-20T10:11:34Z + Last Update Time: 2019-09-20T10:11:34Z + Status: True + Type: PodsReady + Last Transition Time: 2019-09-20T10:07:19Z + Last Update Time: 2019-09-20T10:07:19Z + Status: False + Type: Error + Last Transition Time: 2019-09-20T09:50:57Z + Last Update Time: 2019-09-20T09:50:57Z + Status: False + Type: RollbackInProgress +``` + +### Failure +If the Rollback Fails, the cluster would move to `Error` state indicated by this cluster condition: ``` ClusterConditionType: Error Status: True @@ -88,7 +154,31 @@ Reason: RollbackFailed Message:
``` -When a rollback failure happens, manual intervention would be required to solve this. -After checking and resolving the root cause of failure, a user can upgrade to : -1. The version to which a user initially intended to upgrade.(which caused upgrade failure) +Example: +``` +Status: + Conditions: + Last Transition Time: 2019-09-20T09:46:24Z + Last Update Time: 2019-09-20T09:46:24Z + Status: False + Type: Upgrading + Last Transition Time: 2019-09-20T09:49:26Z + Last Update Time: 2019-09-20T09:49:26Z + Status: False + Type: PodsReady + Last Transition Time: 2019-09-20T09:46:24Z + Last Update Time: 2019-09-20T09:50:57Z + Message: failed to sync bookkeeper version. pod pravega-bookie-0 update failed because of ImagePullBackOff + Reason: RollbackFailed + Status: True + Type: Error + Last Transition Time: 2019-09-20T09:50:57Z + Last Update Time: 2019-09-20T09:50:57Z + Status: False + Type: RollbackInProgress +``` + +When a rollback failure happens, manual intervention would be required to resolve this. +After checking and solving the root cause of failure, to bring the cluster back to a stable state, a user can upgrade to: +1. The version to which the user initially intended to upgrade (when the upgrade failure was noticed). 2. To any other supported version based versions of all pods in the cluster. diff --git a/doc/upgrade-cluster.md b/doc/upgrade-cluster.md index bd3ea4154..1d9c1fcfb 100644 --- a/doc/upgrade-cluster.md +++ b/doc/upgrade-cluster.md @@ -33,7 +33,7 @@ NAME VERSION DESIRED MEMBERS READY MEMBERS AGE example 0.4.0 7 7 11m ``` -## Upgrade Supported Versions Matrix +## Upgrade Path Matrix | BASE VERSION | TARGET VERSION | | ------------ | ---------------- | @@ -141,7 +141,7 @@ NAME VERSION DESIRED MEMBERS READY MEMBERS AGE example 0.5.0 8 8 1h ``` -To see progress of Upgrade, you can do a `kubectl describe` +The command `kubectl describe` can be used to track progress of the upgrade. ``` $ kubectl describe PravegaCluster example ... 
@@ -164,7 +164,7 @@ Status: ``` The `Reason` field in Upgrading Condition shows the component currently being upgraded and `Message` field reflects number of successfully upgraded replicas in this component. -If your upgrade has failed, you can describe the status section of your Pravega cluster to discover why. +If the upgrade has failed, please check the `Status` section to understand the reason for the failure. ``` $ kubectl describe PravegaCluster example diff --git a/pkg/controller/pravegacluster/upgrade.go b/pkg/controller/pravegacluster/upgrade.go index 68c496632..533dedf02 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -77,7 +77,7 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC if syncCompleted { // All component versions have been synced - p.Status.AddToVersionHistory(p.Status.CurrentVersion) + p.Status.AddToVersionHistory(p.Status.TargetVersion) p.Status.CurrentVersion = p.Status.TargetVersion log.Printf("Upgrade completed for all pravega components.") }