[Kubernetes secret provider] Add cache for the secrets #3822

Merged · 29 commits · Dec 18, 2023
Changes from 16 commits
Commits (29)

dadc79b  Add secret cache (constanca-m, Nov 27, 2023)
ea3a100  Add changelog (constanca-m, Nov 27, 2023)
a71101b  Remove assignment (constanca-m, Nov 27, 2023)
125330f  Change TTL default to 1 min (constanca-m, Nov 27, 2023)
09ba2b6  Remove secrets based on last access (constanca-m, Nov 27, 2023)
4908655  Remove secrets based on last access (constanca-m, Nov 27, 2023)
a291bae  Update duration config format (constanca-m, Nov 27, 2023)
9f5b92c  Add unit test (constanca-m, Nov 27, 2023)
9d94859  Add unit test (constanca-m, Nov 27, 2023)
6d30148  Use assert.eventually (constanca-m, Nov 28, 2023)
d7c8756  Fix typos (constanca-m, Nov 28, 2023)
b3ba219  Move key validation to addToCache (constanca-m, Nov 28, 2023)
8913d04  fix race in tests (constanca-m, Nov 28, 2023)
0fcd65a  fix race in tests (constanca-m, Nov 28, 2023)
57ad12c  fix race in tests (constanca-m, Nov 28, 2023)
8533ec4  fix race in tests (constanca-m, Nov 28, 2023)
c65ae20  Rename TTLUpdate to refresh interval. (constanca-m, Dec 4, 2023)
87f0453  Add context timeout. (constanca-m, Dec 5, 2023)
f02823e  Switch reading lock to writing lock. (constanca-m, Dec 6, 2023)
97d6c84  Switch reading lock to writing lock. (constanca-m, Dec 6, 2023)
0f8d86a  Add disable cache option (constanca-m, Dec 8, 2023)
70c16ca  Rename cache fields (constanca-m, Dec 11, 2023)
7f40d9c  Changes config names (constanca-m, Dec 13, 2023)
62862a3  Merge maps (constanca-m, Dec 13, 2023)
7642f09  Merge maps (constanca-m, Dec 13, 2023)
ab76805  Merge maps, fix mistake (constanca-m, Dec 14, 2023)
4f3ab48  Change locks to reading locks (constanca-m, Dec 14, 2023)
044e7d1  Change locks to reading locks (constanca-m, Dec 14, 2023)
8b7da6c  Remove read lock for iteration (constanca-m, Dec 14, 2023)
32 changes: 32 additions & 0 deletions changelog/fragments/1701091034-add-cache-for-secrets.yaml
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: feature

# Change summary; a 80ish characters long description of the change.
summary: add cache for secrets when using kubernetes secret provider

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3822

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/3594
14 changes: 13 additions & 1 deletion internal/pkg/composable/providers/kubernetessecrets/config.go
@@ -4,10 +4,22 @@

package kubernetessecrets

import "github.com/elastic/elastic-agent-autodiscover/kubernetes"
import (
"time"

"github.com/elastic/elastic-agent-autodiscover/kubernetes"
)

// Config for kubernetes provider
type Config struct {
KubeConfig string `config:"kube_config"`
KubeClientOptions kubernetes.KubeClientOptions `config:"kube_client_options"`

TTLUpdate time.Duration `config:"ttl_update"`
TTLDelete time.Duration `config:"ttl_delete"`
}

func (c *Config) InitDefaults() {
c.TTLUpdate = 60 * time.Second
c.TTLDelete = 1 * time.Hour
}
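
The two settings above drive the cache behaviour: `ttl_update` is how often the provider refreshes cached secret values from the Kubernetes API, and `ttl_delete` is how long an entry may go unread before it is evicted. A minimal sketch of how the defaults interact with an override, using a standalone copy of the struct (the 5-minute value and the `Example` helper are purely illustrative, not part of the PR):

```go
// Illustrative only: a standalone copy of the Config type showing how the
// defaults behave when a single setting is overridden.
package sketch

import (
	"fmt"
	"time"
)

type Config struct {
	TTLUpdate time.Duration `config:"ttl_update"`
	TTLDelete time.Duration `config:"ttl_delete"`
}

// InitDefaults mirrors the defaults added in this PR.
func (c *Config) InitDefaults() {
	c.TTLUpdate = 60 * time.Second
	c.TTLDelete = 1 * time.Hour
}

func Example() {
	var cfg Config
	cfg.InitDefaults() // defaults are set before user-provided values are applied

	// A user overriding only ttl_update (e.g. "ttl_update: 5m") keeps the
	// one-hour ttl_delete default.
	cfg.TTLUpdate = 5 * time.Minute

	fmt.Println(cfg.TTLUpdate, cfg.TTLDelete) // 5m0s 1h0m0s
}
```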
@@ -8,6 +8,7 @@ import (
"context"
"strings"
"sync"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
k8sclient "k8s.io/client-go/kubernetes"
@@ -33,6 +34,14 @@ type contextProviderK8sSecrets struct {

clientMx sync.Mutex
client k8sclient.Interface

secretsCacheMx sync.RWMutex
secretsCache map[string]*secretsData
}

type secretsData struct {
value string
lastAccess time.Time
}

// ContextProviderBuilder builds the context provider.
@@ -46,38 +55,152 @@ func ContextProviderBuilder(logger *logger.Logger, c *config.Config, managed boo
return nil, errors.New(err, "failed to unpack configuration")
}
return &contextProviderK8sSecrets{
logger: logger,
config: &cfg,
logger: logger,
config: &cfg,
secretsCache: make(map[string]*secretsData),
}, nil
}

func (p *contextProviderK8sSecrets) Fetch(key string) (string, bool) {
// key = "kubernetes_secrets.somenamespace.somesecret.value"
return p.getFromCache(key)
}

// Run initializes the k8s secrets context provider.
func (p *contextProviderK8sSecrets) Run(ctx context.Context, comm corecomp.ContextProviderComm) error {
client, err := getK8sClientFunc(p.config.KubeConfig, p.config.KubeClientOptions)
if err != nil {
p.logger.Debugf("Kubernetes_secrets provider skipped, unable to connect: %s", err)
return nil
}
p.clientMx.Lock()
client := p.client
p.client = client
p.clientMx.Unlock()
if client == nil {
return "", false
go p.updateSecrets(ctx)

<-comm.Done()

p.clientMx.Lock()
p.client = nil
p.clientMx.Unlock()
return comm.Err()
}

func getK8sClient(kubeconfig string, opt kubernetes.KubeClientOptions) (k8sclient.Interface, error) {
return kubernetes.GetKubernetesClient(kubeconfig, opt)
}

// Update the secrets in the cache every TTL minutes
func (p *contextProviderK8sSecrets) updateSecrets(ctx context.Context) {
timer := time.NewTimer(p.config.TTLUpdate)
for {
select {
case <-ctx.Done():
return
case <-timer.C:
p.updateCache()
timer.Reset(p.config.TTLUpdate)
}
}
}

func (p *contextProviderK8sSecrets) updateCache() {
p.secretsCacheMx.Lock()
// deleting entries does not free the memory, so we need to create a new map
// to place the secrets we want to keep
cacheTmp := make(map[string]*secretsData)
for name, data := range p.secretsCache {
diff := time.Since(data.lastAccess)
if diff > p.config.TTLDelete {
delete(p.secretsCache, name)
} else {
newValue, ok := p.fetchSecret(name)
Contributor:

This is another issue that I am seeing: the lock is being held while making API calls to fetch the secret. The worry is that this could block for a long time while holding the lock, which would then block rendering of the policy.

Is it possible to get multiple key values in a single API call? It might be better to first gather all the keys that need new values, then make a single API call to get all the values (if possible), and then update the cache with the results.

Contributor Author (constanca-m):

> Is it possible to get multiple key values in a single API call?

It is not possible. The only way to get multiple secrets at once is to use the list function instead of get here. However, the options to filter the secrets we return are these, and as we can see, there are none to filter by name. So in the end we would be retrieving all the secrets from a namespace when there is no need.

Contributor:

Dang. Okay, I am still worried about it holding the lock the entire time it is refreshing all the secrets.

Member (cmacknz):

Does fetchSecret have a timeout? What if you never get a response? What if you do get a response, but it takes one hour?

Now that cacheTmp is being rebuilt, is holding the lock for each fetchSecret call still necessary? Can cacheTmp be built and then assigned to p.secretsCache with only that one line holding the lock? Since we are iterating over secretsCache, that would likely require duplicating the keys first.

> However, the options to filter the secrets we return are these, and as we can see, there are none to filter by name. So in the end we would be retrieving all the secrets from a namespace when there is no need.

What places more load on the k8s API: a single response that is unnecessarily large, or multiple small requests made rapidly one after another? We should bias towards keeping the k8s api-server stable over optimizing the request size, if possible.

Contributor Author (constanca-m):

> Does fetchSecret have a timeout? What if you never get a response? What if you do get a response, but it takes one hour?

I added a timeout just now, so we will never be waiting for 1h.

> Now that cacheTmp is being rebuilt, is holding the lock for each fetchSecret call still necessary?

I changed the implementation this way (a sketch of the pattern follows the updateCache diff below):

1. Create the temporary cache.
2. Apply a read lock and iterate over the entries of the secrets cache. If an entry remains, write it to the temporary cache.
3. Once the iteration is over, apply a write lock to the secrets cache and make it the same as the temporary one.

Is this ok this way @cmacknz?

if !ok {
delete(p.secretsCache, name)
} else {
data.value = newValue
cacheTmp[name] = data
}
}
}
p.secretsCache = cacheTmp
p.secretsCacheMx.Unlock()
}
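
The rebuild-and-swap refresh described in the review thread above can be sketched as follows; this is a minimal illustration of the locking pattern only, and the names (`store`, `entry`, `fetchValue`, `refresh`) are placeholders rather than the provider's actual fields and methods:

```go
// Sketch of the rebuild-and-swap refresh: hold only a read lock while the
// temporary map is rebuilt, and take the write lock just long enough to swap
// the maps. All identifiers here are illustrative stand-ins.
package sketch

import (
	"sync"
	"time"
)

type entry struct {
	value      string
	lastAccess time.Time
}

type store struct {
	mx         sync.RWMutex
	cache      map[string]*entry
	ttlDelete  time.Duration
	fetchValue func(key string) (string, bool) // e.g. a Kubernetes API call
}

func (s *store) refresh() {
	tmp := make(map[string]*entry)

	// Read lock: concurrent lookups can still read the existing map while the
	// (potentially slow) fetches happen.
	s.mx.RLock()
	for key, e := range s.cache {
		if time.Since(e.lastAccess) > s.ttlDelete {
			continue // expired entry: simply do not copy it over
		}
		if v, ok := s.fetchValue(key); ok {
			tmp[key] = &entry{value: v, lastAccess: e.lastAccess}
		}
	}
	s.mx.RUnlock()

	// Write lock only for the swap itself.
	s.mx.Lock()
	s.cache = tmp
	s.mx.Unlock()
}
```

One trade-off worth noting: readers are never blocked by the rebuild, but any writer (for example, a lookup that wants to update `lastAccess` under the write lock) still waits until the read lock is released.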

func (p *contextProviderK8sSecrets) getFromCache(key string) (string, bool) {
p.secretsCacheMx.RLock()
_, ok := p.secretsCache[key]
p.secretsCacheMx.RUnlock()

// if value is still not present in cache, it is possible we haven't tried to fetch it yet
if !ok {
data, ok := p.addToCache(key)
// if it was not possible to fetch the secret, return
if !ok {
return data.value, ok
}
}

var pass string
p.secretsCacheMx.Lock()
data, ok := p.secretsCache[key]
data.lastAccess = time.Now()
pass = data.value
p.secretsCacheMx.Unlock()
return pass, ok
}

func (p *contextProviderK8sSecrets) addToCache(key string) (secretsData, bool) {
// Make sure the key has the expected format "kubernetes_secrets.somenamespace.somesecret.value"
tokens := strings.Split(key, ".")
if len(tokens) > 0 && tokens[0] != "kubernetes_secrets" {
return "", false
return secretsData{
value: "",
}, false
}
if len(tokens) != 4 {
p.logger.Debugf(
"not valid secret key: %v. Secrets should be of the following format %v",
key,
"kubernetes_secrets.somenamespace.somesecret.value",
)
return secretsData{
value: "",
}, false
}

value, ok := p.fetchSecret(key)
data := secretsData{
value: value,
}
if ok {
p.secretsCacheMx.Lock()
p.secretsCache[key] = &data
p.secretsCacheMx.Unlock()
}
return data, ok
}

func (p *contextProviderK8sSecrets) fetchSecret(key string) (string, bool) {
p.clientMx.Lock()
client := p.client
p.clientMx.Unlock()
if client == nil {
return "", false
}

tokens := strings.Split(key, ".")
// key has the format "kubernetes_secrets.somenamespace.somesecret.value"
// This function is only called from:
// - addToCache, where we already validated that the key has the right format.
// - updateCache, where the results are only added to the cache through addToCache
// Because of this we no longer need to validate the key
ns := tokens[1]
secretName := tokens[2]
secretVar := tokens[3]

secretIntefrace := client.CoreV1().Secrets(ns)
secretInterface := client.CoreV1().Secrets(ns)
ctx := context.TODO()
secret, err := secretIntefrace.Get(ctx, secretName, metav1.GetOptions{})
secret, err := secretInterface.Get(ctx, secretName, metav1.GetOptions{})
if err != nil {
p.logger.Errorf("Could not retrieve secret from k8s API: %v", err)
return "", false
@@ -87,26 +210,6 @@ func (p *contextProviderK8sSecrets) Fetch(key string) (string, bool) {
return "", false
}
secretString := secret.Data[secretVar]
return string(secretString), true
}

// Run initializes the k8s secrets context provider.
func (p *contextProviderK8sSecrets) Run(ctx context.Context, comm corecomp.ContextProviderComm) error {
client, err := getK8sClientFunc(p.config.KubeConfig, p.config.KubeClientOptions)
if err != nil {
p.logger.Debugf("Kubernetes_secrets provider skipped, unable to connect: %s", err)
return nil
}
p.clientMx.Lock()
p.client = client
p.clientMx.Unlock()
<-comm.Done()
p.clientMx.Lock()
p.client = nil
p.clientMx.Unlock()
return comm.Err()
}

func getK8sClient(kubeconfig string, opt kubernetes.KubeClientOptions) (k8sclient.Interface, error) {
return kubernetes.GetKubernetesClient(kubeconfig, opt)
return string(secretString), true
}
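
On the timeout question from the review thread: at this commit `fetchSecret` still uses `context.TODO()` for the `Get` call; a later commit (87f0453, "Add context timeout.") bounds it. A rough sketch of what such a bound can look like, where the helper name `getSecretValue` and the 10-second deadline are illustrative choices, not the PR's exact change:

```go
// Sketch: bound the Kubernetes API call with a context deadline so a slow or
// unresponsive api-server cannot stall the refresh loop indefinitely. The
// helper name and the 10-second value are illustrative only.
package sketch

import (
	"context"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	k8sclient "k8s.io/client-go/kubernetes"
)

func getSecretValue(client k8sclient.Interface, namespace, name, field string) (string, bool) {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	secret, err := client.CoreV1().Secrets(namespace).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		// Includes context.DeadlineExceeded when the api-server does not answer in time.
		return "", false
	}
	value, ok := secret.Data[field]
	if !ok {
		return "", false
	}
	return string(value), true
}
```

With client-go, cancelling the context aborts the underlying request, so the caller regains control even if the api-server never responds.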
@@ -138,3 +138,119 @@ func Test_K8sSecretsProvider_FetchWrongSecret(t *testing.T) {
assert.False(t, found)
assert.EqualValues(t, val, "")
}

func Test_K8sSecretsProvider_Check_TTL(t *testing.T) {
client := k8sfake.NewSimpleClientset()

ttlDelete, err := time.ParseDuration("1s")
require.NoError(t, err)

ttlUpdate, err := time.ParseDuration("100ms")
require.NoError(t, err)

secret := &v1.Secret{
TypeMeta: metav1.TypeMeta{
Kind: "Secret",
APIVersion: "apps/v1beta1",
},
ObjectMeta: metav1.ObjectMeta{
Name: "testing_secret",
Namespace: ns,
},
Data: map[string][]byte{
"secret_value": []byte(pass),
},
}
_, err = client.CoreV1().Secrets(ns).Create(context.Background(), secret, metav1.CreateOptions{})
require.NoError(t, err)

logger := logp.NewLogger("test_k8s_secrets")

c := map[string]interface{}{
"ttl_update": ttlUpdate,
"ttl_delete": ttlDelete,
}
cfg, err := config.NewConfigFrom(c)
require.NoError(t, err)

p, err := ContextProviderBuilder(logger, cfg, true)
require.NoError(t, err)

fp, _ := p.(*contextProviderK8sSecrets)

getK8sClientFunc = func(kubeconfig string, opt kubernetes.KubeClientOptions) (k8sclient.Interface, error) {
return client, nil
}
require.NoError(t, err)

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
comm := ctesting.NewContextComm(ctx)

go func() {
_ = fp.Run(ctx, comm)
}()

for {
fp.clientMx.Lock()
client := fp.client
fp.clientMx.Unlock()
if client != nil {
break
}
<-time.After(10 * time.Millisecond)
}

// Secret cache should be empty at start
fp.secretsCacheMx.Lock()
assert.Equal(t, 0, len(fp.secretsCache))
fp.secretsCacheMx.Unlock()

key := "kubernetes_secrets.test_namespace.testing_secret.secret_value"

// Secret should be in the cache after this call
val, found := fp.Fetch(key)
assert.True(t, found)
assert.Equal(t, val, pass)
fp.secretsCacheMx.RLock()
assert.Equal(t, len(fp.secretsCache), 1)
assert.NotNil(t, fp.secretsCache[key])
assert.NotZero(t, fp.secretsCache[key].lastAccess)
fp.secretsCacheMx.RUnlock()

// Update the secret and check after TTL time, the secret value is correct
newPass := "new-pass"
secret = &v1.Secret{
TypeMeta: metav1.TypeMeta{
Kind: "Secret",
APIVersion: "apps/v1beta1",
},
ObjectMeta: metav1.ObjectMeta{
Name: "testing_secret",
Namespace: ns,
},
Data: map[string][]byte{
"secret_value": []byte(newPass),
},
}
_, err = client.CoreV1().Secrets(ns).Update(context.Background(), secret, metav1.UpdateOptions{})
require.NoError(t, err)

// wait for ttl update
<-time.After(ttlUpdate)
assert.Eventuallyf(t, func() bool {
val, found = fp.Fetch(key)
return found && val == newPass
}, ttlUpdate*3, ttlUpdate, "Failed to update the secret value after TTL update has passed.")

// After TTL delete, secret should no longer be found in cache since it was never
// fetched during that time
<-time.After(ttlDelete)
assert.Eventuallyf(t, func() bool {
fp.secretsCacheMx.RLock()
size := len(fp.secretsCache)
fp.secretsCacheMx.RUnlock()
return size == 0
}, ttlDelete*3, ttlDelete, "Failed to delete the secret after TTL delete has passed.")

}