From f4ec70382590d21ea262784e2ecb457f22628ca5 Mon Sep 17 00:00:00 2001 From: Amine Date: Wed, 21 Feb 2024 18:46:04 -0600 Subject: [PATCH] Resolve race condition between CARM ConfigMap and reconciler for annotated namespaces (#138) Addresses https://github.com/aws-controllers-k8s/community/issues/2011 In certain scenarios, where a user deploys a resource to a namespace annotated with a specific ownner accountID, a race condition was identified between the reconciler and the CARM (Cross Account Resource Management) `ConfigMap`. This race condition resulted in the controller setting an empty roleARN, preventing the aws-sdk-go client from pivoting (calling `STS::AssumeRole`) and managing resourecs in the correct account. Instead, resources were inadvertently managed in the default account instead of the namespace assigned account. This issue stemmed from the initial implementation of the CARM feature, where the method responsible for retrieving the accountID from the cache, didn't not properly verify the existance and content of the CARM configMap and instead returned an empty stringy when these conditions were not satisfied. This led to selection of the default account (when an empty `RoleARN` is returned )for resource management. Although these scenarios are rare, they can occur in clusters with a significantly high number of namespaces, causing a delay between naemsapce/configmap events and the informer's event handlers. This patch addresses the race issue by implementing two main things: - Proper error propagation: an error is no propagated when a `ConfigMap` is missing or when an accountID entry is missing in the `ConfigMap`. This helps the reconciler make the right decision on how to handle these cases. - Improved error handling: The reconciler now carefully handles these errors and requeues whenever a user has issued an owneraccountid-annotated namespace but the Configmap is not create or properly propagated. Signed-off-by: Amine Hilaly By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. --- pkg/runtime/adoption_reconciler.go | 40 ++++++++++++---- pkg/runtime/cache/account.go | 59 +++++++++++++++++++----- pkg/runtime/cache/account_test.go | 74 +++++++++++++++++++++++------- pkg/runtime/reconciler.go | 57 +++++++++++++++++++---- 4 files changed, 185 insertions(+), 45 deletions(-) diff --git a/pkg/runtime/adoption_reconciler.go b/pkg/runtime/adoption_reconciler.go index a891ebe..47ab8d9 100644 --- a/pkg/runtime/adoption_reconciler.go +++ b/pkg/runtime/adoption_reconciler.go @@ -15,6 +15,7 @@ package runtime import ( "context" + "fmt" "github.com/go-logr/logr" "github.com/pkg/errors" @@ -107,10 +108,28 @@ func (r *adoptionReconciler) reconcile(ctx context.Context, req ctrlrt.Request) return ackerr.NotAdoptable } - targetDescriptor := rmf.ResourceDescriptor() - acctID := r.getOwnerAccountID(res) + // If a user has specified a namespace that is annotated with the + // an owner account ID, we need an appropriate role ARN to assume + // in order to perform the reconciliation. The roles ARN are typically + // stored in a ConfigMap in the ACK system namespace. + // If the ConfigMap is not created, or not populated with an + // accountID to roleARN mapping, we need to properly requeue with a + // helpful message to the user. + var roleARN ackv1alpha1.AWSResourceName + acctID, needCARMLookup := r.getOwnerAccountID(res) + if needCARMLookup { + // This means that the user is specifying a namespace that is + // annotated with an owner account ID. We need to retrieve the + // roleARN from the ConfigMap and properly requeue if the roleARN + // is not available. + roleARN, err = r.getRoleARN(acctID) + if err != nil { + // r.getRoleARN errors are not terminal, we should requeue. + return requeue.NeededAfter(err, roleARNNotAvailableRequeueDelay) + } + } region := r.getRegion(res) - roleARN := r.getRoleARN(acctID) + targetDescriptor := rmf.ResourceDescriptor() endpointURL := r.getEndpointURL(res) gvk := targetDescriptor.GroupVersionKind() @@ -428,16 +447,16 @@ func (r *adoptionReconciler) handleReconcileError(err error) (ctrlrt.Result, err // that the service controller is in. func (r *adoptionReconciler) getOwnerAccountID( res *ackv1alpha1.AdoptedResource, -) ackv1alpha1.AWSAccountID { +) (ackv1alpha1.AWSAccountID, bool) { // look for owner account id in the namespace annotations namespace := res.GetNamespace() accID, ok := r.cache.Namespaces.GetOwnerAccountID(namespace) if ok { - return ackv1alpha1.AWSAccountID(accID) + return ackv1alpha1.AWSAccountID(accID), true } // use controller configuration - return ackv1alpha1.AWSAccountID(r.cfg.AccountID) + return ackv1alpha1.AWSAccountID(r.cfg.AccountID), false } // getEndpointURL returns the AWS account that owns the supplied resource. @@ -462,9 +481,12 @@ func (r *adoptionReconciler) getEndpointURL( // the resources. func (r *adoptionReconciler) getRoleARN( acctID ackv1alpha1.AWSAccountID, -) ackv1alpha1.AWSResourceName { - roleARN, _ := r.cache.Accounts.GetAccountRoleARN(string(acctID)) - return ackv1alpha1.AWSResourceName(roleARN) +) (ackv1alpha1.AWSResourceName, error) { + roleARN, err := r.cache.Accounts.GetAccountRoleARN(string(acctID)) + if err != nil { + return "", fmt.Errorf("unable to retrieve role ARN for account %s: %v", acctID, err) + } + return ackv1alpha1.AWSResourceName(roleARN), nil } // getRegion returns the AWS region that the given resource is in or should be diff --git a/pkg/runtime/cache/account.go b/pkg/runtime/cache/account.go index 812df66..8f275e2 100644 --- a/pkg/runtime/cache/account.go +++ b/pkg/runtime/cache/account.go @@ -14,6 +14,7 @@ package cache import ( + "errors" "sync" "github.com/go-logr/logr" @@ -23,6 +24,18 @@ import ( k8scache "k8s.io/client-go/tools/cache" ) +var ( + // ErrCARMConfigMapNotFound is an error that is returned when the CARM + // configmap is not found. + ErrCARMConfigMapNotFound = errors.New("CARM configmap not found") + // ErrAccountIDNotFound is an error that is returned when the account ID + // is not found in the CARM configmap. + ErrAccountIDNotFound = errors.New("account ID not found in CARM configmap") + // ErrEmptyRoleARN is an error that is returned when the role ARN is empty + // in the CARM configmap. + ErrEmptyRoleARN = errors.New("role ARN is empty in CARM configmap") +) + const ( // ACKRoleAccountMap is the name of the configmap map object storing // all the AWS Account IDs associated with their AWS Role ARNs. @@ -34,15 +47,17 @@ const ( // make the changes accordingly. type AccountCache struct { sync.RWMutex - log logr.Logger - roleARNs map[string]string + log logr.Logger + roleARNs map[string]string + configMapCreated bool } // NewAccountCache instanciate a new AccountCache. func NewAccountCache(log logr.Logger) *AccountCache { return &AccountCache{ - log: log.WithName("cache.account"), - roleARNs: make(map[string]string), + log: log.WithName("cache.account"), + roleARNs: make(map[string]string), + configMapCreated: false, } } @@ -55,6 +70,7 @@ func resourceMatchACKRoleAccountsConfigMap(raw interface{}) bool { // Run instantiate a new SharedInformer for ConfigMaps and runs it to begin processing items. func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{}) { + c.log.V(1).Info("Starting shared informer for accounts cache", "targetConfigMap", ACKRoleAccountMap) informer := informersv1.NewConfigMapInformer( clientSet, ackSystemNamespace, @@ -66,7 +82,10 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{ if resourceMatchACKRoleAccountsConfigMap(obj) { cm := obj.(*corev1.ConfigMap) object := cm.DeepCopy() - c.updateAccountRoleData(object.Data) + // To avoid multiple mutex locks, we are updating the cache + // and the configmap existence flag in the same function. + configMapCreated := true + c.updateAccountRoleData(configMapCreated, object.Data) c.log.V(1).Info("created account config map", "name", cm.ObjectMeta.Name) } }, @@ -75,7 +94,7 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{ cm := desired.(*corev1.ConfigMap) object := cm.DeepCopy() //TODO(a-hilaly): compare data checksum before updating the cache - c.updateAccountRoleData(object.Data) + c.updateAccountRoleData(true, object.Data) c.log.V(1).Info("updated account config map", "name", cm.ObjectMeta.Name) } }, @@ -83,7 +102,10 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{ if resourceMatchACKRoleAccountsConfigMap(obj) { cm := obj.(*corev1.ConfigMap) newMap := make(map[string]string) - c.updateAccountRoleData(newMap) + // To avoid multiple mutex locks, we are updating the cache + // and the configmap existence flag in the same function. + configMapCreated := false + c.updateAccountRoleData(configMapCreated, newMap) c.log.V(1).Info("deleted account config map", "name", cm.ObjectMeta.Name) } }, @@ -92,17 +114,32 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{ } // GetAccountRoleARN queries the AWS accountID associated Role ARN -// from the cached CARM configmap. This function is thread safe. -func (c *AccountCache) GetAccountRoleARN(accountID string) (string, bool) { +// from the cached CARM configmap. It will return an error if the +// configmap is not found, the accountID is not found or the role ARN +// is empty. +// +// This function is thread safe. +func (c *AccountCache) GetAccountRoleARN(accountID string) (string, error) { c.RLock() defer c.RUnlock() + + if !c.configMapCreated { + return "", ErrCARMConfigMapNotFound + } roleARN, ok := c.roleARNs[accountID] - return roleARN, ok && roleARN != "" + if !ok { + return "", ErrAccountIDNotFound + } + if roleARN == "" { + return "", ErrEmptyRoleARN + } + return roleARN, nil } // updateAccountRoleData updates the CARM map. This function is thread safe. -func (c *AccountCache) updateAccountRoleData(data map[string]string) { +func (c *AccountCache) updateAccountRoleData(exist bool, data map[string]string) { c.Lock() defer c.Unlock() c.roleARNs = data + c.configMapCreated = exist } diff --git a/pkg/runtime/cache/account_test.go b/pkg/runtime/cache/account_test.go index 948c7c9..9ae24cb 100644 --- a/pkg/runtime/cache/account_test.go +++ b/pkg/runtime/cache/account_test.go @@ -37,11 +37,14 @@ const ( testAccountARN1 = "arn:aws:iam::012345678912:role/S3Access" testAccount2 = "219876543210" testAccountARN2 = "arn:aws:iam::012345678912:role/root" + testAccount3 = "321987654321" + testAccountARN3 = "" ) func TestAccountCache(t *testing.T) { accountsMap1 := map[string]string{ testAccount1: testAccountARN1, + testAccount3: testAccountARN3, } accountsMap2 := map[string]string{ @@ -65,8 +68,14 @@ func TestAccountCache(t *testing.T) { stopCh := make(chan struct{}) accountCache.Run(k8sClient, stopCh) + // Before creating the configmap, the accountCache should error for any + // GetAccountRoleARN call. + _, err := accountCache.GetAccountRoleARN(testAccount1) + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound) + // Test create events - _, err := k8sClient.CoreV1().ConfigMaps(testNamespace).Create( + _, err = k8sClient.CoreV1().ConfigMaps(testNamespace).Create( context.Background(), &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -80,8 +89,15 @@ func TestAccountCache(t *testing.T) { time.Sleep(time.Second) - _, ok := accountCache.GetAccountRoleARN("random-account") - require.False(t, ok) + // Test with non existing account + _, err = accountCache.GetAccountRoleARN("random-account-not-exist") + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound) + + // Test with existing account + _, err = accountCache.GetAccountRoleARN(testAccount1) + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound) k8sClient.CoreV1().ConfigMaps(testNamespace).Create( context.Background(), @@ -97,12 +113,20 @@ func TestAccountCache(t *testing.T) { time.Sleep(time.Second) - roleARN, ok := accountCache.GetAccountRoleARN(testAccount1) - require.True(t, ok) - require.Equal(t, roleARN, testAccountARN1) + // Test with non existing account + _, err = accountCache.GetAccountRoleARN("random-account-not-exist") + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrAccountIDNotFound) - _, ok = accountCache.GetAccountRoleARN(testAccount2) - require.False(t, ok) + // Test with existing account - but role ARN is empty + _, err = accountCache.GetAccountRoleARN(testAccount3) + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrEmptyRoleARN) + + // Test with existing account + roleARN, err := accountCache.GetAccountRoleARN(testAccount1) + require.Nil(t, err) + require.Equal(t, roleARN, testAccountARN1) // Test update events k8sClient.CoreV1().ConfigMaps("ack-system").Update( @@ -119,12 +143,23 @@ func TestAccountCache(t *testing.T) { time.Sleep(time.Second) - roleARN, ok = accountCache.GetAccountRoleARN(testAccount1) - require.True(t, ok) + // Test with non existing account + _, err = accountCache.GetAccountRoleARN("random-account-not-exist") + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrAccountIDNotFound) + + // Test that account was removed + _, err = accountCache.GetAccountRoleARN(testAccount3) + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrAccountIDNotFound) + + // Test with existing account + roleARN, err = accountCache.GetAccountRoleARN(testAccount1) + require.Nil(t, err) require.Equal(t, roleARN, testAccountARN1) - roleARN, ok = accountCache.GetAccountRoleARN(testAccount2) - require.True(t, ok) + roleARN, err = accountCache.GetAccountRoleARN(testAccount2) + require.Nil(t, err) require.Equal(t, roleARN, testAccountARN2) // Test delete events @@ -136,9 +171,16 @@ func TestAccountCache(t *testing.T) { time.Sleep(time.Second) - _, ok = accountCache.GetAccountRoleARN(testAccount1) - require.False(t, ok) - _, ok = accountCache.GetAccountRoleARN(testAccount2) - require.False(t, ok) + // Test that accounts ware removed + _, err = accountCache.GetAccountRoleARN(testAccount1) + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound) + + _, err = accountCache.GetAccountRoleARN(testAccount2) + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound) + _, err = accountCache.GetAccountRoleARN(testAccount3) + require.NotNil(t, err) + require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound) } diff --git a/pkg/runtime/reconciler.go b/pkg/runtime/reconciler.go index d143bc5..de42ee0 100644 --- a/pkg/runtime/reconciler.go +++ b/pkg/runtime/reconciler.go @@ -50,6 +50,10 @@ const ( // the successful reconciliation. This behavior for a resource can be // overriden by RequeueOnSuccessSeconds configuration for that resource. defaultResyncPeriod = 10 * time.Hour + // roleARNNotAvailableRequeueDelay is the default duration to requeue the + // resource if the CARM cache is not synced yet, or if the roleARN is not + // available. + roleARNNotAvailableRequeueDelay = 15 * time.Second ) // reconciler describes a generic reconciler within ACK. @@ -192,11 +196,31 @@ func (r *resourceReconciler) Reconcile(ctx context.Context, req ctrlrt.Request) return ctrlrt.Result{}, err } - acctID := r.getOwnerAccountID(desired) + // If a user has specified a namespace that is annotated with the + // an owner account ID, we need an appropriate role ARN to assume + // in order to perform the reconciliation. The roles ARN are typically + // stored in a ConfigMap in the ACK system namespace. + // If the ConfigMap is not created, or not populated with an + // accountID to roleARN mapping, we need to properly requeue with a + // helpful message to the user. + var roleARN ackv1alpha1.AWSResourceName + acctID, needCARMLookup := r.getOwnerAccountID(desired) + if needCARMLookup { + // This means that the user is specifying a namespace that is + // annotated with an owner account ID. We need to retrieve the + // roleARN from the ConfigMap and properly requeue if the roleARN + // is not available. + roleARN, err = r.getRoleARN(acctID) + if err != nil { + // r.getRoleARN errors are not terminal, we should requeue. + return ctrlrt.Result{}, requeue.NeededAfter(err, roleARNNotAvailableRequeueDelay) + } + } region := r.getRegion(desired) - roleARN := r.getRoleARN(acctID) + endpointURL := r.getEndpointURL(desired) gvk := r.rd.GroupVersionKind() + // New session will only pivot to the roleARN if it is not empty. sess, err := r.sc.NewSession(region, &endpointURL, roleARN, gvk) if err != nil { return ctrlrt.Result{}, err @@ -1008,32 +1032,47 @@ func (r *resourceReconciler) HandleReconcileError( // by the default AWS account ID associated with the Kubernetes Namespace in // which the CR was created, followed by the AWS Account in which the IAM Role // that the service controller is in. +// +// This function is also returning a boolean stating whether the account ID +// is retrieved from the namespace annotations. This information is used to +// determine whether the a role ARN should be assumed to manage the resource, +// which is typically found in the CARM ConfigMap. +// +// If the returned boolean is true, it means that the resource is owned by +// a different account than the controller's default account ID, and the +// controller should lookup the CARM ConfigMap. func (r *resourceReconciler) getOwnerAccountID( res acktypes.AWSResource, -) ackv1alpha1.AWSAccountID { +) (ackv1alpha1.AWSAccountID, bool) { + controllerAccountID := ackv1alpha1.AWSAccountID(r.cfg.AccountID) + + // look for owner account id in the resource status acctID := res.Identifiers().OwnerAccountID() if acctID != nil { - return *acctID + return *acctID, *acctID != controllerAccountID } // look for owner account id in the namespace annotations namespace := res.MetaObject().GetNamespace() accID, ok := r.cache.Namespaces.GetOwnerAccountID(namespace) if ok { - return ackv1alpha1.AWSAccountID(accID) + return ackv1alpha1.AWSAccountID(accID), true } // use controller configuration - return ackv1alpha1.AWSAccountID(r.cfg.AccountID) + return controllerAccountID, false } // getRoleARN return the Role ARN that should be assumed in order to manage // the resources. func (r *resourceReconciler) getRoleARN( acctID ackv1alpha1.AWSAccountID, -) ackv1alpha1.AWSResourceName { - roleARN, _ := r.cache.Accounts.GetAccountRoleARN(string(acctID)) - return ackv1alpha1.AWSResourceName(roleARN) +) (ackv1alpha1.AWSResourceName, error) { + roleARN, err := r.cache.Accounts.GetAccountRoleARN(string(acctID)) + if err != nil { + return "", fmt.Errorf("unable to retrieve role ARN for account %s: %v", acctID, err) + } + return ackv1alpha1.AWSResourceName(roleARN), nil } // getRegion returns the region the resource exists in, or if the resource