Skip to content
This repository has been archived by the owner on Nov 30, 2023. It is now read-only.

Commit

Permalink
Avoid vmss upgrade stuck (#1475)
Browse files Browse the repository at this point in the history
* Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid.

* Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid.

* Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid.

* Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid.
  • Loading branch information
Christian Bianchi authored Jun 21, 2021
1 parent 7f50563 commit fa564a2
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `AzureClusterIdentity`, and the secret it references are created in the `AzureCluster` namespace instead of `giantswarm`.
- Don't update `AzureClusterIdentity` CRs that are not managed by azure-operator.

### Fixed

- Don't get the node pool upgrade stuck if the current state of `AzureMachinePool` is invalid.

## [5.7.0] - 2021-05-13

### Changed
Expand Down
8 changes: 8 additions & 0 deletions pkg/handler/nodes/state/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,11 @@ import "github.com/giantswarm/microerror"
// executionFailedError is the package-level microerror value whose Kind is
// "executionFailedError".
// NOTE(review): presumably matched by IsExecutionFailedError, which is
// referenced by the tests but defined outside this view — confirm.
var executionFailedError = &microerror.Error{
Kind: "executionFailedError",
}

// unknownStateError is the package-level microerror value whose Kind is
// "unknownStateError". Machine.Execute masks it when the current state has no
// entry in the machine's Transitions; IsUnkownStateError matches against it.
var unknownStateError = &microerror.Error{
Kind: "unknownStateError",
}

// IsUnkownStateError reports whether the root cause of err, as resolved by
// microerror.Cause, is unknownStateError.
//
// The spelling "Unkown" is part of the exported name and is kept as is because
// callers reference it.
func IsUnkownStateError(err error) bool {
	cause := microerror.Cause(err)
	return cause == unknownStateError
}
2 changes: 1 addition & 1 deletion pkg/handler/nodes/state/funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
func (m Machine) Execute(ctx context.Context, obj interface{}, currentState State) (State, error) {
transitionFunc, exists := m.Transitions[currentState]
if !exists {
return "", microerror.Maskf(executionFailedError, "State: %q is not configured in this state machine", currentState)
return "", microerror.Maskf(unknownStateError, "State: %q is not configured in this state machine", currentState)
}

newState, err := transitionFunc(ctx, obj, currentState)
Expand Down
4 changes: 2 additions & 2 deletions pkg/handler/nodes/state/funcs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func Test_StateMachine(t *testing.T) {
},
currentState: "half-way",
expectedNewState: "",
errorMatcher: IsExecutionFailedError,
errorMatcher: IsUnkownStateError,
},
{
name: "case 2: unknown new state",
Expand All @@ -75,7 +75,7 @@ func Test_StateMachine(t *testing.T) {
machine: Machine{},
currentState: "start",
expectedNewState: "",
errorMatcher: IsExecutionFailedError,
errorMatcher: IsUnkownStateError,
},
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/label/label.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,6 @@ const (
ClusterOperatorVersion = "cluster-operator.giantswarm.io/version"
ReleaseVersion = "release.giantswarm.io/version"
SingleTenantSP = "giantswarm.io/single-tenant-service-principal"

AzureOperatorVersionTag = "gs-azure-operator.giantswarm.io-version"
)
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import (
"github.com/giantswarm/errors/tenant"
"github.com/giantswarm/tenantcluster/v3/pkg/tenantcluster"

"github.com/giantswarm/azure-operator/v5/pkg/label"

"github.com/giantswarm/azure-operator/v5/pkg/project"

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute"
Expand Down Expand Up @@ -151,7 +153,7 @@ func (r *Resource) isMastersVmssUpToDate(ctx context.Context, azureConfig *provi
return false, microerror.Mask(err)
}

azureOperatorVersionTag, ok := mastersVMSS.Tags["gs-azure-operator.giantswarm.io-version"]
azureOperatorVersionTag, ok := mastersVMSS.Tags[label.AzureOperatorVersionTag]
if !ok || *azureOperatorVersionTag != project.Version() {
return false, nil
}
Expand Down
12 changes: 11 additions & 1 deletion service/controller/azuremachinepool/handler/nodepool/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,17 @@ func (r *Resource) EnsureCreated(ctx context.Context, obj interface{}) error {

r.Logger.Debugf(ctx, "current state: %s", currentState)
newState, err = r.StateMachine.Execute(ctx, obj, currentState)
if err != nil {
if state.IsUnkownStateError(err) {
// This can happen if there is a race condition with a previous version of the azure operator,
// or if the node pool was, at upgrade time, in a state that no longer exists in this azure
// operator version.
// If this error happened while upgrading to a new release and the ARM deployment was already
// applied, we need to ensure that the nodes are going to be rolled out.
// We move directly to `ScaleUpWorkerVMSS`. If for any reason the ARM deployment has not been applied, the
// `ScaleUpWorkerVMSS` handler will detect the situation and go back to the `DeploymentUninitialized` state.
r.Logger.Debugf(ctx, "Azure Machine Pool was in state %q that is unknown to this azure operator version's state machine. To avoid blocking an upgrade the state will be set to %q.", currentState, ScaleUpWorkerVMSS)
newState = ScaleUpWorkerVMSS
} else if err != nil {
return microerror.Mask(err)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,26 @@ func (r *Resource) scaleUpWorkerVMSSTransition(ctx context.Context, obj interfac
return currentState, nil
}

virtualMachineScaleSetsClient, err := r.ClientFactory.GetVirtualMachineScaleSetsClient(ctx, azureMachinePool.ObjectMeta)
if err != nil {
return currentState, microerror.Mask(err)
}

vmss, err := virtualMachineScaleSetsClient.Get(ctx, key.ClusterID(&azureMachinePool), key.NodePoolVMSSName(&azureMachinePool))
if IsNotFound(err) {
// vmss not found, we need to apply the deployment again.
r.Logger.Debugf(ctx, "Node Pool VMSS was not found, going back to initial state.")
return DeploymentUninitialized, nil
} else if err != nil {
return currentState, microerror.Mask(err)
}

// Check if the azure operator tag is up to date.
if currentVersion, found := vmss.Tags[label.AzureOperatorVersionTag]; !found || *currentVersion != project.Version() {
r.Logger.Debugf(ctx, "Node Pool VMSS's has an outdated %q label.", label.AzureOperatorVersionTag)
return DeploymentUninitialized, nil
}

oldInstances, newInstances, err := r.splitInstancesByUpdatedStatus(ctx, azureMachinePool)
if tenantcluster.IsAPINotAvailableError(err) {
r.Logger.Debugf(ctx, "tenant API not available yet")
Expand Down

0 comments on commit fa564a2

Please sign in to comment.