refactor amcp and ammp to use transient error with requeue
This was done to account for the initial creation of agent pools. Agent
pools were making repeated, unnecessary CreateOrUpdate calls that
returned 404. Now they wait for the managed control plane to be
initialized before reconciling, which appears to be the intent of
initially reconciling the agent pools when the AKS instance is first
constructed.
devigned committed Nov 8, 2021
1 parent d282228 commit 6189796
Showing 13 changed files with 83 additions and 53 deletions.
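For context, the sketch below shows the transient-error-with-requeue pattern this commit adopts, in a self-contained form. The `transientError` type and `withTransientError` helper are illustrative stand-ins modeled on the `azure.WithTransientError` and `azure.ReconcileError` usage visible in the diffs; they are not the provider's actual implementation.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// transientError marks an error as retryable and carries a requeue hint.
// It is loosely modeled on azure.WithTransientError / azure.ReconcileError
// as used in the diffs below; the names here are illustrative.
type transientError struct {
	err          error
	requeueAfter time.Duration
}

func (e transientError) Error() string               { return e.err.Error() }
func (e transientError) Unwrap() error               { return e.err }
func (e transientError) IsTransient() bool           { return true }
func (e transientError) RequeueAfter() time.Duration { return e.requeueAfter }

// withTransientError wraps err so callers can requeue instead of failing hard.
func withTransientError(err error, requeueAfter time.Duration) error {
	return transientError{err: err, requeueAfter: requeueAfter}
}

// reconcile simulates a service call that cannot proceed until the managed
// control plane exists and reports that as a transient condition.
func reconcile(controlPlaneExists bool) error {
	if !controlPlaneExists {
		return withTransientError(errors.New("agent pool dependent resource does not exist yet"), 20*time.Second)
	}
	return nil
}

func main() {
	err := reconcile(false)

	var tErr transientError
	switch {
	case errors.As(err, &tErr):
		// A controller would return reconcile.Result{RequeueAfter: tErr.RequeueAfter()}, nil here.
		fmt.Printf("transient failure, requeue after %s: %v\n", tErr.RequeueAfter(), err)
	case err != nil:
		fmt.Println("hard failure:", err)
	default:
		fmt.Println("reconciled")
	}
}
```

In the controllers changed below, the same decision is made by unwrapping into `azure.ReconcileError` with `errors.As`, requeueing after the hinted duration for transient failures and dropping terminal ones without returning an error.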
5 changes: 4 additions & 1 deletion Makefile
@@ -52,6 +52,9 @@ KUBETEST_WINDOWS_CONF_PATH ?= $(abspath $(E2E_DATA_DIR)/kubetest/upstream-window
KUBETEST_REPO_LIST_PATH ?= $(abspath $(E2E_DATA_DIR)/kubetest/)
AZURE_TEMPLATES := $(E2E_DATA_DIR)/infrastructure-azure

# use the project local tool binaries first
export PATH := $(TOOLS_BIN_DIR):$(PATH)

# set --output-base used for conversion-gen which needs to be different for in GOPATH and outside GOPATH dev
ifneq ($(abspath $(ROOT_DIR)),$(GOPATH)/src/sigs.k8s.io/cluster-api-provider-azure)
OUTPUT_BASE := --output-base=$(ROOT_DIR)
@@ -169,7 +172,7 @@ test-cover: envs-test $(KUBECTL) $(KUBE_APISERVER) $(ETCD) ## Run tests with cod
go tool cover -html=coverage.out -o coverage.html

.PHONY: test-e2e-run
test-e2e-run: generate-e2e-templates $(ENVSUBST) $(KUBECTL) $(GINKGO) ## Run e2e tests
test-e2e-run: generate-e2e-templates $(ENVSUBST) $(KUSTOMIZE) $(KUBECTL) $(GINKGO) ## Run e2e tests
$(ENVSUBST) < $(E2E_CONF_FILE) > $(E2E_CONF_FILE_ENVSUBST) && \
$(GINKGO) -v -trace -tags=e2e -focus="$(GINKGO_FOCUS)" -skip="$(GINKGO_SKIP)" -nodes=$(GINKGO_NODES) --noColor=$(GINKGO_NOCOLOR) $(GINKGO_ARGS) ./test/e2e -- \
-e2e.artifacts-folder="$(ARTIFACTS)" \
7 changes: 5 additions & 2 deletions azure/services/agentpools/agentpools.go
@@ -19,6 +19,7 @@ package agentpools
import (
"context"
"fmt"
"time"

"github.com/Azure/azure-sdk-for-go/services/containerservice/mgmt/2021-05-01/containerservice"
"github.com/go-logr/logr"
@@ -90,15 +91,17 @@ func (s *Service) Reconcile(ctx context.Context) error {
isCreate := azure.ResourceNotFound(err)
if isCreate {
err = s.Client.CreateOrUpdate(ctx, agentPoolSpec.ResourceGroup, agentPoolSpec.Cluster, agentPoolSpec.Name, profile)
if err != nil {
if err != nil && azure.ResourceNotFound(err) {
return azure.WithTransientError(errors.Wrap(err, "agent pool dependent resource does not exist yet"), 20*time.Second)
} else if err != nil {
return errors.Wrap(err, "failed to create or update agent pool")
}
} else {
ps := *existingPool.ManagedClusterAgentPoolProfileProperties.ProvisioningState
if ps != string(infrav1alpha4.Canceled) && ps != string(infrav1alpha4.Failed) && ps != string(infrav1alpha4.Succeeded) {
msg := fmt.Sprintf("Unable to update existing agent pool in non terminal state. Agent pool must be in one of the following provisioning states: canceled, failed, or succeeded. Actual state: %s", ps)
log.V(2).Info(msg)
return errors.New(msg)
return azure.WithTransientError(errors.New(msg), 20*time.Second)
}

// Normalize individual agent pools to diff in case we need to update
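The new branch above hinges on the 404 check. As a rough illustration only, a 404 detector over go-autorest errors can be written as below; the provider's actual `azure.ResourceNotFound` helper may be implemented differently.

```go
package main

import (
	"errors"
	"fmt"
	"net/http"

	"github.com/Azure/go-autorest/autorest"
)

// resourceNotFound reports whether err represents an HTTP 404 from the Azure
// API. This is only a sketch of what a check like azure.ResourceNotFound
// typically does; the provider's real helper may differ.
func resourceNotFound(err error) bool {
	var detailed autorest.DetailedError
	if errors.As(err, &detailed) {
		code, ok := detailed.StatusCode.(int)
		return ok && code == http.StatusNotFound
	}
	return false
}

func main() {
	// Simulate the 404 an agent pool CreateOrUpdate/Get can return while the
	// managed cluster is still being created.
	err := autorest.NewErrorWithResponse("agentpools", "Get",
		&http.Response{StatusCode: http.StatusNotFound}, "agent pool not found")
	fmt.Println(resourceNotFound(err)) // true
}
```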
10 changes: 6 additions & 4 deletions azure/services/agentpools/agentpools_test.go
@@ -55,9 +55,10 @@ func TestReconcile(t *testing.T) {
provisioningStatesToTest: []string{"Canceled", "Succeeded", "Failed"},
expectedError: "",
expect: func(m *mock_agentpools.MockClientMockRecorder, provisioningstate string) {
pv := provisioningstate
m.CreateOrUpdate(gomockinternal.AContext(), "my-rg", "my-cluster", "my-agentpool", gomock.Any()).Return(nil)
m.Get(gomockinternal.AContext(), "my-rg", "my-cluster", "my-agentpool").Return(containerservice.AgentPool{ManagedClusterAgentPoolProfileProperties: &containerservice.ManagedClusterAgentPoolProfileProperties{
ProvisioningState: &provisioningstate,
ProvisioningState: &pv,
}}, nil)
},
},
@@ -69,7 +70,7 @@
Name: "my-agentpool",
},
provisioningStatesToTest: []string{"Deleting", "InProgress", "randomStringHere"},
expectedError: "Unable to update existing agent pool in non terminal state. Agent pool must be in one of the following provisioning states: canceled, failed, or succeeded. Actual state: randomStringHere",
expectedError: "Unable to update existing agent pool in non terminal state. Agent pool must be in one of the following provisioning states: canceled, failed, or succeeded. Actual state:",
expect: func(m *mock_agentpools.MockClientMockRecorder, provisioningstate string) {
m.Get(gomockinternal.AContext(), "my-rg", "my-cluster", "my-agentpool").Return(containerservice.AgentPool{ManagedClusterAgentPoolProfileProperties: &containerservice.ManagedClusterAgentPoolProfileProperties{
ProvisioningState: &provisioningstate,
@@ -80,8 +81,9 @@

for _, tc := range provisioningstatetestcases {
for _, provisioningstate := range tc.provisioningStatesToTest {
t.Logf("Testing agentpool provision state: " + provisioningstate)
tc := tc
provisioningstate := provisioningstate
t.Logf("Testing agentpool provision state: " + provisioningstate)
t.Run(tc.name, func(t *testing.T) {
g := NewWithT(t)
t.Parallel()
@@ -119,7 +121,7 @@ func TestReconcile(t *testing.T) {

err := s.Reconcile(context.TODO())
if tc.expectedError != "" {
g.Expect(err.Error()).To(HavePrefix(tc.expectedError))
g.Expect(err.Error()).To(ContainSubstring(tc.expectedError))
g.Expect(err.Error()).To(ContainSubstring(provisioningstate))
} else {
g.Expect(err).NotTo(HaveOccurred())
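The added `tc := tc` and `provisioningstate := provisioningstate` lines rebind the loop variables before the parallel subtests start. Prior to Go 1.22, range variables are reused across iterations, so closures that outlive an iteration can all observe the final value. A standalone sketch of the pitfall and the fix:

```go
package main

import (
	"fmt"
	"sync"
)

// Prior to Go 1.22 a range variable is a single variable reused across
// iterations, so goroutines (or parallel subtests) that close over it may all
// observe the last value. Rebinding inside the loop gives each closure its own
// copy; this is the same trick as `tc := tc` and
// `provisioningstate := provisioningstate` in the test above.
func main() {
	states := []string{"Canceled", "Succeeded", "Failed"}

	var wg sync.WaitGroup
	for _, state := range states {
		state := state // per-iteration copy; without this, all goroutines may print "Failed"
		wg.Add(1)
		go func() {
			defer wg.Done()
			fmt.Println("testing provisioning state:", state)
		}()
	}
	wg.Wait()
}
```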
5 changes: 3 additions & 2 deletions azure/services/managedclusters/managedclusters.go
@@ -20,6 +20,7 @@ import (
"context"
"fmt"
"net"
"time"

"github.com/Azure/azure-sdk-for-go/services/containerservice/mgmt/2021-05-01/containerservice"
"github.com/Azure/go-autorest/autorest/to"
@@ -159,7 +160,7 @@ func (s *Service) Reconcile(ctx context.Context) error {
existingMC, err := s.Client.Get(ctx, managedClusterSpec.ResourceGroupName, managedClusterSpec.Name)
// Transient or other failure not due to 404
if err != nil && !azure.ResourceNotFound(err) {
return errors.Wrap(err, "failed to fetch existing managed cluster")
return azure.WithTransientError(errors.Wrap(err, "failed to fetch existing managed cluster"), 20*time.Second)
}

// We are creating this cluster for the first time.
@@ -300,7 +301,7 @@ func (s *Service) Reconcile(ctx context.Context) error {
if ps != string(infrav1alpha4.Canceled) && ps != string(infrav1alpha4.Failed) && ps != string(infrav1alpha4.Succeeded) {
msg := fmt.Sprintf("Unable to update existing managed cluster in non terminal state. Managed cluster must be in one of the following provisioning states: canceled, failed, or succeeded. Actual state: %s", ps)
klog.V(2).Infof(msg)
return errors.New(msg)
return azure.WithTransientError(errors.New(msg), 20*time.Second)
}

// Normalize the LoadBalancerProfile so the diff below doesn't get thrown off by AKS added properties.
2 changes: 1 addition & 1 deletion azure/services/managedclusters/managedclusters_test.go
@@ -103,7 +103,7 @@ func TestReconcile(t *testing.T) {
err := s.Reconcile(context.TODO())
if tc.expectedError != "" {
g.Expect(err).To(HaveOccurred())
g.Expect(err.Error()).To(HavePrefix(tc.expectedError))
g.Expect(err.Error()).To(ContainSubstring(tc.expectedError))
g.Expect(err.Error()).To(ContainSubstring(provisioningstate))
} else {
g.Expect(err).NotTo(HaveOccurred())
@@ -740,7 +740,7 @@ spec:
of AzureManagedControlPlane.
properties:
initialized:
description: Initialized is true when the the control plane is available
description: Initialized is true when the control plane is available
for initial contact. This may occur before the control plane is
fully ready. In the AzureManagedControlPlane implementation, these
are identical.
2 changes: 1 addition & 1 deletion exp/api/v1beta1/azuremanagedcontrolplane_types.go
@@ -204,7 +204,7 @@ type AzureManagedControlPlaneStatus struct {
// +optional
Ready bool `json:"ready,omitempty"`

// Initialized is true when the the control plane is available for initial contact.
// Initialized is true when the control plane is available for initial contact.
// This may occur before the control plane is fully ready.
// In the AzureManagedControlPlane implementation, these are identical.
// +optional
36 changes: 27 additions & 9 deletions exp/controllers/azuremanagedcontrolplane_controller.go
@@ -26,6 +26,14 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/client-go/tools/record"
infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/azure"
"sigs.k8s.io/cluster-api-provider-azure/azure/scope"
infracontroller "sigs.k8s.io/cluster-api-provider-azure/controllers"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
clusterv1exp "sigs.k8s.io/cluster-api/exp/api/v1beta1"
"sigs.k8s.io/cluster-api/util"
@@ -37,14 +45,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"

infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/azure/scope"
infracontroller "sigs.k8s.io/cluster-api-provider-azure/controllers"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
)

// AzureManagedControlPlaneReconciler reconciles an AzureManagedControlPlane object.
@@ -209,17 +209,35 @@ func (amcpr *AzureManagedControlPlaneReconciler) reconcileNormal(ctx context.Con
controllerutil.AddFinalizer(scope.ControlPlane, infrav1.ClusterFinalizer)
// Register the finalizer immediately to avoid orphaning Azure resources on delete
if err := scope.PatchObject(ctx); err != nil {
amcpr.Recorder.Eventf(scope.ControlPlane, corev1.EventTypeWarning, "AzureManagedControlPlane unavailable", "failed to patch resource: %s", err)
return reconcile.Result{}, err
}

if err := newAzureManagedControlPlaneReconciler(scope).Reconcile(ctx); err != nil {
// Handle transient and terminal errors
log := scope.WithValues("name", scope.ControlPlane.Name, "namespace", scope.ControlPlane.Namespace)
var reconcileError azure.ReconcileError
if errors.As(err, &reconcileError) {
if reconcileError.IsTerminal() {
log.Error(err, "failed to reconcile AzureManagedControlPlane")
return reconcile.Result{}, nil
}

if reconcileError.IsTransient() {
log.V(4).Info("requeuing due to transient transient failure", "error", err)
return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
}

return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureManagedControlPlane")
}

return reconcile.Result{}, errors.Wrapf(err, "error creating AzureManagedControlPlane %s/%s", scope.ControlPlane.Namespace, scope.ControlPlane.Name)
}

// No errors, so mark us ready so the Cluster API Cluster Controller can pull it
scope.ControlPlane.Status.Ready = true
scope.ControlPlane.Status.Initialized = true

amcpr.Recorder.Event(scope.ControlPlane, corev1.EventTypeNormal, "AzureManagedControlPlane available", "successfully reconciled")
return reconcile.Result{}, nil
}

48 changes: 30 additions & 18 deletions exp/controllers/azuremanagedmachinepool_controller.go
@@ -22,8 +22,17 @@ import (

"github.com/go-logr/logr"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/client-go/tools/record"
infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/azure"
"sigs.k8s.io/cluster-api-provider-azure/azure/scope"
infracontroller "sigs.k8s.io/cluster-api-provider-azure/controllers"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
clusterv1exp "sigs.k8s.io/cluster-api/exp/api/v1beta1"
"sigs.k8s.io/cluster-api/util"
@@ -35,14 +44,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"

infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/azure/scope"
infracontroller "sigs.k8s.io/cluster-api-provider-azure/controllers"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
)

// AzureManagedMachinePoolReconciler reconciles an AzureManagedMachinePool object.
@@ -188,10 +189,10 @@ func (ammpr *AzureManagedMachinePoolReconciler) Reconcile(ctx context.Context, r
return reconcile.Result{}, err
}

// For non-system node pools, we wait for the control plane to be
// initialized, otherwise Azure API will return an error for node pool
// CreateOrUpdate request.
if infraPool.Spec.Mode != string(infrav1exp.NodePoolModeSystem) && !controlPlane.Status.Initialized {
// Upon first create of an AKS service, the node pools are provided to the CreateOrUpdate call. After the initial
// create of the control plane and node pools, the control plane will transition to initialized. After the control
// plane is initialized, we can proceed to reconcile managed machine pools.
if !controlPlane.Status.Initialized {
log.Info("AzureManagedControlPlane is not initialized")
return reconcile.Result{}, nil
}
@@ -240,18 +241,29 @@ func (ammpr *AzureManagedMachinePoolReconciler) reconcileNormal(ctx context.Cont
}

if err := ammpr.createAzureManagedMachinePoolService(scope).Reconcile(ctx); err != nil {
if IsAgentPoolVMSSNotFoundError(err) {
// if the underlying VMSS is not yet created, requeue for 30s in the future
return reconcile.Result{
RequeueAfter: 30 * time.Second,
}, nil
// Handle transient and terminal errors
log := scope.WithValues("name", scope.InfraMachinePool.Name, "namespace", scope.InfraMachinePool.Namespace)
var reconcileError azure.ReconcileError
if errors.As(err, &reconcileError) {
if reconcileError.IsTerminal() {
log.Error(err, "failed to reconcile AzureManagedMachinePool")
return reconcile.Result{}, nil
}

if reconcileError.IsTransient() {
log.V(4).Info("requeuing due to transient transient failure", "error", err)
return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
}

return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureManagedMachinePool")
}

return reconcile.Result{}, errors.Wrapf(err, "error creating AzureManagedMachinePool %s/%s", scope.InfraMachinePool.Namespace, scope.InfraMachinePool.Name)
}

// No errors, so mark us ready so the Cluster API Cluster Controller can pull it
scope.InfraMachinePool.Status.Ready = true

ammpr.Recorder.Eventf(scope.InfraMachinePool, corev1.EventTypeNormal, "AzureManagedMachinePool available", "agent pool successfully reconciled")
return reconcile.Result{}, nil
}

12 changes: 2 additions & 10 deletions exp/controllers/azuremanagedmachinepool_reconciler.go
@@ -20,6 +20,7 @@ import (
"context"
"fmt"
"strings"
"time"

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2021-04-01/compute"
"github.com/pkg/errors"
@@ -52,10 +53,6 @@ type (
}
)

var (
notFoundErr = new(AgentPoolVMSSNotFoundError)
)

// NewAgentPoolVMSSNotFoundError creates a new AgentPoolVMSSNotFoundError.
func NewAgentPoolVMSSNotFoundError(nodeResourceGroup, poolName string) *AgentPoolVMSSNotFoundError {
return &AgentPoolVMSSNotFoundError{
@@ -113,7 +110,7 @@ func (s *azureManagedMachinePoolService) Reconcile(ctx context.Context) error {
}

if match == nil {
return NewAgentPoolVMSSNotFoundError(nodeResourceGroup, agentPoolName)
return azure.WithTransientError(NewAgentPoolVMSSNotFoundError(nodeResourceGroup, agentPoolName), 20*time.Second)
}

instances, err := s.scaleSetsSvc.ListInstances(ctx, nodeResourceGroup, *match.Name)
@@ -145,8 +142,3 @@ func (s *azureManagedMachinePoolService) Delete(ctx context.Context) error {

return nil
}

// IsAgentPoolVMSSNotFoundError returns true if the error is an AgentPoolVMSSNotFoundError.
func IsAgentPoolVMSSNotFoundError(err error) bool {
return errors.Is(err, notFoundErr)
}
2 changes: 1 addition & 1 deletion exp/controllers/azuremanagedmachinepool_reconciler_test.go
@@ -56,7 +56,7 @@ func TestIsAgentPoolVMSSNotFoundError(t *testing.T) {
t.Run(c.Name, func(t *testing.T) {
t.Parallel()
g := gomega.NewWithT(t)
g.Expect(IsAgentPoolVMSSNotFoundError(c.Err)).To(gomega.Equal(c.Expected))
g.Expect(errors.Is(c.Err, NewAgentPoolVMSSNotFoundError("foo", "baz"))).To(gomega.Equal(c.Expected))
})
}
}
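The updated assertion relies on `errors.Is` matching two distinct `*AgentPoolVMSSNotFoundError` values, which only works if the type supplies its own `Is` method (plain pointer comparison would fail). The sketch below assumes such a method; it is not a quote of the provider's code.

```go
package main

import (
	"errors"
	"fmt"
)

// AgentPoolVMSSNotFoundError mirrors the error type referenced in the test
// above. The Is method is an assumption about how errors.Is matching across
// distinct instances is enabled; it is not the provider's verbatim code.
type AgentPoolVMSSNotFoundError struct {
	NodeResourceGroup string
	PoolName          string
}

func NewAgentPoolVMSSNotFoundError(nodeResourceGroup, poolName string) *AgentPoolVMSSNotFoundError {
	return &AgentPoolVMSSNotFoundError{NodeResourceGroup: nodeResourceGroup, PoolName: poolName}
}

func (e *AgentPoolVMSSNotFoundError) Error() string {
	return fmt.Sprintf("VMSS for agent pool %q not found in resource group %q", e.PoolName, e.NodeResourceGroup)
}

// Is treats any *AgentPoolVMSSNotFoundError as a match, so errors.Is succeeds
// regardless of the fields on the target instance.
func (e *AgentPoolVMSSNotFoundError) Is(target error) bool {
	_, ok := target.(*AgentPoolVMSSNotFoundError)
	return ok
}

func main() {
	err := NewAgentPoolVMSSNotFoundError("my-rg", "my-pool")
	fmt.Println(errors.Is(err, NewAgentPoolVMSSNotFoundError("foo", "baz"))) // true
}
```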
2 changes: 0 additions & 2 deletions scripts/ci-e2e.sh
@@ -36,8 +36,6 @@ mkdir -p "${REPO_ROOT}/hack/tools/bin"
KUBECTL=$(realpath hack/tools/bin/kubectl)
make "${KUBECTL}" &>/dev/null

# shellcheck source=hack/ensure-kustomize.sh
source "${REPO_ROOT}/hack/ensure-kustomize.sh"
# shellcheck source=hack/ensure-tags.sh
source "${REPO_ROOT}/hack/ensure-tags.sh"
# shellcheck source=hack/parse-prow-creds.sh
3 changes: 2 additions & 1 deletion test/e2e/azure_timesync.go
@@ -87,7 +87,7 @@ func AzureTimeSyncSpec(ctx context.Context, inputGetter func() AzureTimeSyncSpec
}

// May need to break this up on clusters with larger number of nodes. There is a 10 ssh connection default.
if s.IsWindows {
if machineInfo.IsWindows {
//skip for now
Logf("Skipping windows time sync check. TODO: re-enable and check w32t service is running. Issue #1782")
} else {
@@ -199,6 +199,7 @@ func AzureDaemonsetTimeSyncSpec(ctx context.Context, inputGetter func() AzureTim
nsenterCmd := []string{"nsenter", "--target", "1", "--mount=/proc/1/ns/mnt", "--", "/bin/sh", "-c"}
var testFuncs []func() error
for key := range execInfo {
key := key
s := execInfo[key]
Byf("checking that time synchronization is healthy on %s", s.Hostname)
execToStringFn := func(expected, command string) func() error {