From fa564a24b1e3b3d0190896f3c8276870ab2b742d Mon Sep 17 00:00:00 2001 From: Christian Bianchi Date: Mon, 21 Jun 2021 15:04:19 +0200 Subject: [PATCH] Avoid vmss upgrade stuck (#1475) * Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid. * Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid. * Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid. * Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid. --- CHANGELOG.md | 4 ++++ pkg/handler/nodes/state/error.go | 8 ++++++++ pkg/handler/nodes/state/funcs.go | 2 +- pkg/handler/nodes/state/funcs_test.go | 4 ++-- pkg/label/label.go | 2 ++ .../create_master_instances_upgrading.go | 4 +++- .../handler/nodepool/create.go | 12 ++++++++++- .../handler/nodepool/create_scale_workers.go | 20 +++++++++++++++++++ 8 files changed, 51 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffb09782e7..14a32471de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `AzureClusterIdentity`, and the secret it references are created in the `AzureCluster` namespace instead of `giantswarm`. - Don't update `AzureClusterIdentity` CR's that are not managed by azure-operator. +### Fixed + +- Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid. + ## [5.7.0] - 2021-05-13 ### Changed diff --git a/pkg/handler/nodes/state/error.go b/pkg/handler/nodes/state/error.go index 4190740a1d..c48fdbf476 100644 --- a/pkg/handler/nodes/state/error.go +++ b/pkg/handler/nodes/state/error.go @@ -13,3 +13,11 @@ import "github.com/giantswarm/microerror" var executionFailedError = µerror.Error{ Kind: "executionFailedError", } + +var unknownStateError = µerror.Error{ + Kind: "unknownStateError", +} + +func IsUnkownStateError(err error) bool { + return microerror.Cause(err) == unknownStateError +} diff --git a/pkg/handler/nodes/state/funcs.go b/pkg/handler/nodes/state/funcs.go index b7403bc979..f0fd35c5f8 100644 --- a/pkg/handler/nodes/state/funcs.go +++ b/pkg/handler/nodes/state/funcs.go @@ -9,7 +9,7 @@ import ( func (m Machine) Execute(ctx context.Context, obj interface{}, currentState State) (State, error) { transitionFunc, exists := m.Transitions[currentState] if !exists { - return "", microerror.Maskf(executionFailedError, "State: %q is not configured in this state machine", currentState) + return "", microerror.Maskf(unknownStateError, "State: %q is not configured in this state machine", currentState) } newState, err := transitionFunc(ctx, obj, currentState) diff --git a/pkg/handler/nodes/state/funcs_test.go b/pkg/handler/nodes/state/funcs_test.go index cf795f681c..99d6704f7a 100644 --- a/pkg/handler/nodes/state/funcs_test.go +++ b/pkg/handler/nodes/state/funcs_test.go @@ -54,7 +54,7 @@ func Test_StateMachine(t *testing.T) { }, currentState: "half-way", expectedNewState: "", - errorMatcher: IsExecutionFailedError, + errorMatcher: IsUnkownStateError, }, { name: "case 2: unknown new state", @@ -75,7 +75,7 @@ func Test_StateMachine(t *testing.T) { machine: Machine{}, currentState: "start", expectedNewState: "", - errorMatcher: IsExecutionFailedError, + errorMatcher: IsUnkownStateError, }, } diff --git a/pkg/label/label.go b/pkg/label/label.go index 9d2afd822f..5be064e516 100644 --- a/pkg/label/label.go +++ b/pkg/label/label.go @@ -16,4 +16,6 @@ const ( ClusterOperatorVersion = "cluster-operator.giantswarm.io/version" ReleaseVersion = "release.giantswarm.io/version" SingleTenantSP = "giantswarm.io/single-tenant-service-principal" + + AzureOperatorVersionTag = "gs-azure-operator.giantswarm.io-version" ) diff --git a/service/controller/azureconfig/handler/masters/create_master_instances_upgrading.go b/service/controller/azureconfig/handler/masters/create_master_instances_upgrading.go index a9201db8d3..6bbd95ecfa 100644 --- a/service/controller/azureconfig/handler/masters/create_master_instances_upgrading.go +++ b/service/controller/azureconfig/handler/masters/create_master_instances_upgrading.go @@ -6,6 +6,8 @@ import ( "github.com/giantswarm/errors/tenant" "github.com/giantswarm/tenantcluster/v3/pkg/tenantcluster" + "github.com/giantswarm/azure-operator/v5/pkg/label" + "github.com/giantswarm/azure-operator/v5/pkg/project" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute" @@ -151,7 +153,7 @@ func (r *Resource) isMastersVmssUpToDate(ctx context.Context, azureConfig *provi return false, microerror.Mask(err) } - azureOperatorVersionTag, ok := mastersVMSS.Tags["gs-azure-operator.giantswarm.io-version"] + azureOperatorVersionTag, ok := mastersVMSS.Tags[label.AzureOperatorVersionTag] if !ok || *azureOperatorVersionTag != project.Version() { return false, nil } diff --git a/service/controller/azuremachinepool/handler/nodepool/create.go b/service/controller/azuremachinepool/handler/nodepool/create.go index f3c45ff33a..83b75db6fe 100644 --- a/service/controller/azuremachinepool/handler/nodepool/create.go +++ b/service/controller/azuremachinepool/handler/nodepool/create.go @@ -53,7 +53,17 @@ func (r *Resource) EnsureCreated(ctx context.Context, obj interface{}) error { r.Logger.Debugf(ctx, "current state: %s", currentState) newState, err = r.StateMachine.Execute(ctx, obj, currentState) - if err != nil { + if state.IsUnkownStateError(err) { + // This can happen if there is a race condition with a previous version of the azure operator + // or if the node pool at upgrade time was in a state that doesn't exists any more in this azure + // operator version. + // At this stage if this error happened while upgrading to a new release and the ARM deployment was already applied + // we need to ensure nodes are going to be rolled out. + // We move directly to `ScaleUpWorkerVMSS`. If for any reason the ARM deployment is not applied then the + // `ScaleUpWorkerVMSS` handler will detect the situation and go back to the `DeploymentUninitialized` state. + r.Logger.Debugf(ctx, "Azure Machine Pool was in state %q that is unknown to this azure operator version's state machine. To avoid blocking an upgrade the state will be set to %q.", currentState, ScaleUpWorkerVMSS) + newState = ScaleUpWorkerVMSS + } else if err != nil { return microerror.Mask(err) } } diff --git a/service/controller/azuremachinepool/handler/nodepool/create_scale_workers.go b/service/controller/azuremachinepool/handler/nodepool/create_scale_workers.go index 7f229d84e4..e0d1fc82bf 100644 --- a/service/controller/azuremachinepool/handler/nodepool/create_scale_workers.go +++ b/service/controller/azuremachinepool/handler/nodepool/create_scale_workers.go @@ -94,6 +94,26 @@ func (r *Resource) scaleUpWorkerVMSSTransition(ctx context.Context, obj interfac return currentState, nil } + virtualMachineScaleSetsClient, err := r.ClientFactory.GetVirtualMachineScaleSetsClient(ctx, azureMachinePool.ObjectMeta) + if err != nil { + return currentState, microerror.Mask(err) + } + + vmss, err := virtualMachineScaleSetsClient.Get(ctx, key.ClusterID(&azureMachinePool), key.NodePoolVMSSName(&azureMachinePool)) + if IsNotFound(err) { + // vmss not found, we need to apply the deployment again. + r.Logger.Debugf(ctx, "Node Pool VMSS was not found, going back to initial state.") + return DeploymentUninitialized, nil + } else if err != nil { + return currentState, microerror.Mask(err) + } + + // Check if the azure operator tag is up to date. + if currentVersion, found := vmss.Tags[label.AzureOperatorVersionTag]; !found || *currentVersion != project.Version() { + r.Logger.Debugf(ctx, "Node Pool VMSS's has an outdated %q label.", label.AzureOperatorVersionTag) + return DeploymentUninitialized, nil + } + oldInstances, newInstances, err := r.splitInstancesByUpdatedStatus(ctx, azureMachinePool) if tenantcluster.IsAPINotAvailableError(err) { r.Logger.Debugf(ctx, "tenant API not available yet")