Skip to content

Commit

Permalink
Resolve race condition between CARM ConfigMap and reconciler for anno…
Browse files Browse the repository at this point in the history
…tated namespaces (aws-controllers-k8s#138)

Addresses aws-controllers-k8s/community#2011

In certain scenarios, where a user deploys a resource to a namespace
annotated with a specific ownner accountID, a race condition was
identified between the reconciler and the CARM (Cross Account Resource
Management) `ConfigMap`. This race condition resulted in the controller
setting an empty roleARN, preventing the aws-sdk-go client from pivoting
(calling `STS::AssumeRole`) and managing resourecs in the correct
account. Instead, resources were inadvertently managed in the default
account instead of the namespace assigned account.

This issue stemmed from the initial implementation of the CARM feature,
where the method responsible for retrieving the accountID from the
cache, didn't not properly verify the existance and content of the CARM
configMap and instead returned an empty stringy when these conditions
were not satisfied. This led to selection of the default account (when an
empty `RoleARN` is returned )for resource management.

Although these scenarios are rare, they can occur in clusters with a
significantly high number of namespaces, causing a delay between
naemsapce/configmap events and the informer's event handlers.

This patch addresses the race issue by implementing two main things:
- Proper error propagation: an error is no propagated when a `ConfigMap`
  is missing or when an accountID entry is missing in the `ConfigMap`.
  This helps the reconciler make the right decision on how to handle
  these cases.
- Improved error handling: The reconciler now carefully handles these
  errors and requeues whenever a user has issued an
  owneraccountid-annotated namespace but the Configmap is not create or
  properly propagated.

Signed-off-by: Amine Hilaly <[email protected]>

By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
  • Loading branch information
a-hilaly authored and ndbhat committed Apr 16, 2024
1 parent 13e5c30 commit f4ec703
Show file tree
Hide file tree
Showing 4 changed files with 185 additions and 45 deletions.
40 changes: 31 additions & 9 deletions pkg/runtime/adoption_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package runtime

import (
"context"
"fmt"

"github.com/go-logr/logr"
"github.com/pkg/errors"
Expand Down Expand Up @@ -107,10 +108,28 @@ func (r *adoptionReconciler) reconcile(ctx context.Context, req ctrlrt.Request)
return ackerr.NotAdoptable
}

targetDescriptor := rmf.ResourceDescriptor()
acctID := r.getOwnerAccountID(res)
// If a user has specified a namespace that is annotated with the
// an owner account ID, we need an appropriate role ARN to assume
// in order to perform the reconciliation. The roles ARN are typically
// stored in a ConfigMap in the ACK system namespace.
// If the ConfigMap is not created, or not populated with an
// accountID to roleARN mapping, we need to properly requeue with a
// helpful message to the user.
var roleARN ackv1alpha1.AWSResourceName
acctID, needCARMLookup := r.getOwnerAccountID(res)
if needCARMLookup {
// This means that the user is specifying a namespace that is
// annotated with an owner account ID. We need to retrieve the
// roleARN from the ConfigMap and properly requeue if the roleARN
// is not available.
roleARN, err = r.getRoleARN(acctID)
if err != nil {
// r.getRoleARN errors are not terminal, we should requeue.
return requeue.NeededAfter(err, roleARNNotAvailableRequeueDelay)
}
}
region := r.getRegion(res)
roleARN := r.getRoleARN(acctID)
targetDescriptor := rmf.ResourceDescriptor()
endpointURL := r.getEndpointURL(res)
gvk := targetDescriptor.GroupVersionKind()

Expand Down Expand Up @@ -428,16 +447,16 @@ func (r *adoptionReconciler) handleReconcileError(err error) (ctrlrt.Result, err
// that the service controller is in.
func (r *adoptionReconciler) getOwnerAccountID(
res *ackv1alpha1.AdoptedResource,
) ackv1alpha1.AWSAccountID {
) (ackv1alpha1.AWSAccountID, bool) {
// look for owner account id in the namespace annotations
namespace := res.GetNamespace()
accID, ok := r.cache.Namespaces.GetOwnerAccountID(namespace)
if ok {
return ackv1alpha1.AWSAccountID(accID)
return ackv1alpha1.AWSAccountID(accID), true
}

// use controller configuration
return ackv1alpha1.AWSAccountID(r.cfg.AccountID)
return ackv1alpha1.AWSAccountID(r.cfg.AccountID), false
}

// getEndpointURL returns the AWS account that owns the supplied resource.
Expand All @@ -462,9 +481,12 @@ func (r *adoptionReconciler) getEndpointURL(
// the resources.
func (r *adoptionReconciler) getRoleARN(
acctID ackv1alpha1.AWSAccountID,
) ackv1alpha1.AWSResourceName {
roleARN, _ := r.cache.Accounts.GetAccountRoleARN(string(acctID))
return ackv1alpha1.AWSResourceName(roleARN)
) (ackv1alpha1.AWSResourceName, error) {
roleARN, err := r.cache.Accounts.GetAccountRoleARN(string(acctID))
if err != nil {
return "", fmt.Errorf("unable to retrieve role ARN for account %s: %v", acctID, err)
}
return ackv1alpha1.AWSResourceName(roleARN), nil
}

// getRegion returns the AWS region that the given resource is in or should be
Expand Down
59 changes: 48 additions & 11 deletions pkg/runtime/cache/account.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
package cache

import (
"errors"
"sync"

"github.com/go-logr/logr"
Expand All @@ -23,6 +24,18 @@ import (
k8scache "k8s.io/client-go/tools/cache"
)

var (
// ErrCARMConfigMapNotFound is an error that is returned when the CARM
// configmap is not found.
ErrCARMConfigMapNotFound = errors.New("CARM configmap not found")
// ErrAccountIDNotFound is an error that is returned when the account ID
// is not found in the CARM configmap.
ErrAccountIDNotFound = errors.New("account ID not found in CARM configmap")
// ErrEmptyRoleARN is an error that is returned when the role ARN is empty
// in the CARM configmap.
ErrEmptyRoleARN = errors.New("role ARN is empty in CARM configmap")
)

const (
// ACKRoleAccountMap is the name of the configmap map object storing
// all the AWS Account IDs associated with their AWS Role ARNs.
Expand All @@ -34,15 +47,17 @@ const (
// make the changes accordingly.
type AccountCache struct {
sync.RWMutex
log logr.Logger
roleARNs map[string]string
log logr.Logger
roleARNs map[string]string
configMapCreated bool
}

// NewAccountCache instanciate a new AccountCache.
func NewAccountCache(log logr.Logger) *AccountCache {
return &AccountCache{
log: log.WithName("cache.account"),
roleARNs: make(map[string]string),
log: log.WithName("cache.account"),
roleARNs: make(map[string]string),
configMapCreated: false,
}
}

Expand All @@ -55,6 +70,7 @@ func resourceMatchACKRoleAccountsConfigMap(raw interface{}) bool {

// Run instantiate a new SharedInformer for ConfigMaps and runs it to begin processing items.
func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{}) {
c.log.V(1).Info("Starting shared informer for accounts cache", "targetConfigMap", ACKRoleAccountMap)
informer := informersv1.NewConfigMapInformer(
clientSet,
ackSystemNamespace,
Expand All @@ -66,7 +82,10 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{
if resourceMatchACKRoleAccountsConfigMap(obj) {
cm := obj.(*corev1.ConfigMap)
object := cm.DeepCopy()
c.updateAccountRoleData(object.Data)
// To avoid multiple mutex locks, we are updating the cache
// and the configmap existence flag in the same function.
configMapCreated := true
c.updateAccountRoleData(configMapCreated, object.Data)
c.log.V(1).Info("created account config map", "name", cm.ObjectMeta.Name)
}
},
Expand All @@ -75,15 +94,18 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{
cm := desired.(*corev1.ConfigMap)
object := cm.DeepCopy()
//TODO(a-hilaly): compare data checksum before updating the cache
c.updateAccountRoleData(object.Data)
c.updateAccountRoleData(true, object.Data)
c.log.V(1).Info("updated account config map", "name", cm.ObjectMeta.Name)
}
},
DeleteFunc: func(obj interface{}) {
if resourceMatchACKRoleAccountsConfigMap(obj) {
cm := obj.(*corev1.ConfigMap)
newMap := make(map[string]string)
c.updateAccountRoleData(newMap)
// To avoid multiple mutex locks, we are updating the cache
// and the configmap existence flag in the same function.
configMapCreated := false
c.updateAccountRoleData(configMapCreated, newMap)
c.log.V(1).Info("deleted account config map", "name", cm.ObjectMeta.Name)
}
},
Expand All @@ -92,17 +114,32 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{
}

// GetAccountRoleARN queries the AWS accountID associated Role ARN
// from the cached CARM configmap. This function is thread safe.
func (c *AccountCache) GetAccountRoleARN(accountID string) (string, bool) {
// from the cached CARM configmap. It will return an error if the
// configmap is not found, the accountID is not found or the role ARN
// is empty.
//
// This function is thread safe.
func (c *AccountCache) GetAccountRoleARN(accountID string) (string, error) {
c.RLock()
defer c.RUnlock()

if !c.configMapCreated {
return "", ErrCARMConfigMapNotFound
}
roleARN, ok := c.roleARNs[accountID]
return roleARN, ok && roleARN != ""
if !ok {
return "", ErrAccountIDNotFound
}
if roleARN == "" {
return "", ErrEmptyRoleARN
}
return roleARN, nil
}

// updateAccountRoleData updates the CARM map. This function is thread safe.
func (c *AccountCache) updateAccountRoleData(data map[string]string) {
func (c *AccountCache) updateAccountRoleData(exist bool, data map[string]string) {
c.Lock()
defer c.Unlock()
c.roleARNs = data
c.configMapCreated = exist
}
74 changes: 58 additions & 16 deletions pkg/runtime/cache/account_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,14 @@ const (
testAccountARN1 = "arn:aws:iam::012345678912:role/S3Access"
testAccount2 = "219876543210"
testAccountARN2 = "arn:aws:iam::012345678912:role/root"
testAccount3 = "321987654321"
testAccountARN3 = ""
)

func TestAccountCache(t *testing.T) {
accountsMap1 := map[string]string{
testAccount1: testAccountARN1,
testAccount3: testAccountARN3,
}

accountsMap2 := map[string]string{
Expand All @@ -65,8 +68,14 @@ func TestAccountCache(t *testing.T) {
stopCh := make(chan struct{})
accountCache.Run(k8sClient, stopCh)

// Before creating the configmap, the accountCache should error for any
// GetAccountRoleARN call.
_, err := accountCache.GetAccountRoleARN(testAccount1)
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)

// Test create events
_, err := k8sClient.CoreV1().ConfigMaps(testNamespace).Create(
_, err = k8sClient.CoreV1().ConfigMaps(testNamespace).Create(
context.Background(),
&corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -80,8 +89,15 @@ func TestAccountCache(t *testing.T) {

time.Sleep(time.Second)

_, ok := accountCache.GetAccountRoleARN("random-account")
require.False(t, ok)
// Test with non existing account
_, err = accountCache.GetAccountRoleARN("random-account-not-exist")
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)

// Test with existing account
_, err = accountCache.GetAccountRoleARN(testAccount1)
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)

k8sClient.CoreV1().ConfigMaps(testNamespace).Create(
context.Background(),
Expand All @@ -97,12 +113,20 @@ func TestAccountCache(t *testing.T) {

time.Sleep(time.Second)

roleARN, ok := accountCache.GetAccountRoleARN(testAccount1)
require.True(t, ok)
require.Equal(t, roleARN, testAccountARN1)
// Test with non existing account
_, err = accountCache.GetAccountRoleARN("random-account-not-exist")
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrAccountIDNotFound)

_, ok = accountCache.GetAccountRoleARN(testAccount2)
require.False(t, ok)
// Test with existing account - but role ARN is empty
_, err = accountCache.GetAccountRoleARN(testAccount3)
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrEmptyRoleARN)

// Test with existing account
roleARN, err := accountCache.GetAccountRoleARN(testAccount1)
require.Nil(t, err)
require.Equal(t, roleARN, testAccountARN1)

// Test update events
k8sClient.CoreV1().ConfigMaps("ack-system").Update(
Expand All @@ -119,12 +143,23 @@ func TestAccountCache(t *testing.T) {

time.Sleep(time.Second)

roleARN, ok = accountCache.GetAccountRoleARN(testAccount1)
require.True(t, ok)
// Test with non existing account
_, err = accountCache.GetAccountRoleARN("random-account-not-exist")
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrAccountIDNotFound)

// Test that account was removed
_, err = accountCache.GetAccountRoleARN(testAccount3)
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrAccountIDNotFound)

// Test with existing account
roleARN, err = accountCache.GetAccountRoleARN(testAccount1)
require.Nil(t, err)
require.Equal(t, roleARN, testAccountARN1)

roleARN, ok = accountCache.GetAccountRoleARN(testAccount2)
require.True(t, ok)
roleARN, err = accountCache.GetAccountRoleARN(testAccount2)
require.Nil(t, err)
require.Equal(t, roleARN, testAccountARN2)

// Test delete events
Expand All @@ -136,9 +171,16 @@ func TestAccountCache(t *testing.T) {

time.Sleep(time.Second)

_, ok = accountCache.GetAccountRoleARN(testAccount1)
require.False(t, ok)
_, ok = accountCache.GetAccountRoleARN(testAccount2)
require.False(t, ok)
// Test that accounts ware removed
_, err = accountCache.GetAccountRoleARN(testAccount1)
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)

_, err = accountCache.GetAccountRoleARN(testAccount2)
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)

_, err = accountCache.GetAccountRoleARN(testAccount3)
require.NotNil(t, err)
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)
}
Loading

0 comments on commit f4ec703

Please sign in to comment.