Merge pull request kubernetes#3589 from jbartosik/cache-controllers
Cache controllers
crgarcia12 committed Nov 25, 2020
2 parents ec7a71c + 9d7898a commit 057f57f
Showing 5 changed files with 436 additions and 11 deletions.
11 changes: 9 additions & 2 deletions vertical-pod-autoscaler/pkg/recommender/input/cluster_feeder.go
@@ -49,7 +49,13 @@ import (
 	resourceclient "k8s.io/metrics/pkg/client/clientset/versioned/typed/metrics/v1beta1"
 )
 
-const defaultResyncPeriod time.Duration = 10 * time.Minute
+const (
+	scaleCacheLoopPeriod         time.Duration = 7 * time.Second
+	scaleCacheEntryLifetime      time.Duration = time.Hour
+	scaleCacheEntryFreshnessTime time.Duration = 10 * time.Minute
+	scaleCacheEntryJitterFactor  float64       = 1.
+	defaultResyncPeriod          time.Duration = 10 * time.Minute
+)
 
 // ClusterStateFeeder can update state of ClusterState object.
 type ClusterStateFeeder interface {
@@ -108,7 +114,8 @@ func NewClusterStateFeeder(config *rest.Config, clusterState *model.ClusterState
 	kubeClient := kube_client.NewForConfigOrDie(config)
 	podLister, oomObserver := NewPodListerAndOOMObserver(kubeClient, namespace)
 	factory := informers.NewSharedInformerFactoryWithOptions(kubeClient, defaultResyncPeriod, informers.WithNamespace(namespace))
-	controllerFetcher := controllerfetcher.NewControllerFetcher(config, kubeClient, factory)
+	controllerFetcher := controllerfetcher.NewControllerFetcher(config, kubeClient, factory, scaleCacheEntryFreshnessTime, scaleCacheEntryLifetime, scaleCacheEntryJitterFactor)
+	controllerFetcher.Start(context.TODO(), scaleCacheLoopPeriod)
 	return ClusterStateFeederFactory{
 		PodLister:   podLister,
 		OOMObserver: oomObserver,
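The cache timing constants above interact through wait.Jitter: wait.Jitter(d, maxFactor) returns d plus a uniformly random extra of up to maxFactor*d, so with scaleCacheEntryFreshnessTime of 10 minutes and a jitter factor of 1.0, each cache entry's refresh deadline lands between 10 and 20 minutes out, spreading the refresh load on the API server. A standalone sketch (not part of this commit) illustrating that:

	package main

	import (
		"fmt"
		"time"

		"k8s.io/apimachinery/pkg/util/wait"
	)

	func main() {
		freshness := 10 * time.Minute // scaleCacheEntryFreshnessTime
		jitter := 1.0                 // scaleCacheEntryJitterFactor
		// wait.Jitter(d, f) returns d plus a uniformly random extra of up to
		// f*d, so each printed duration falls in [10m, 20m).
		for i := 0; i < 3; i++ {
			fmt.Println(wait.Jitter(freshness, jitter))
		}
	}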
168 changes: 168 additions & 0 deletions — new file in package controllerfetcher
@@ -0,0 +1,168 @@
/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllerfetcher

import (
	"sync"
	"time"

	autoscalingapi "k8s.io/api/autoscaling/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/klog"
)

// Allows tests to inject their time.
var now = time.Now

type scaleCacheKey struct {
	namespace     string
	groupResource schema.GroupResource
	name          string
}

type scaleCacheEntry struct {
	refreshAfter time.Time
	deleteAfter  time.Time
	resource     *autoscalingapi.Scale
	err          error
}

// Cache for responses to get queries on controllers. Thread safe.
// Usage:
// - `Get` the cached response. If there is one, use it; otherwise make the
//   query and
// - `Insert` the response you got into the cache.
// When you create a `controllerCacheStorage` you should start two go routines
// (a sketch of this wiring follows the file below):
// - One for refreshing cache entries, which calls `GetKeysToRefresh`, then for
//   each key makes a query to the API server and calls `Refresh` to update the
//   content of the cache.
// - A second one for removing stale entries, which periodically calls
//   `RemoveExpired`.
// Each entry is refreshed after a random duration between `validityTime` and
// `validityTime` * (1 + `jitterFactor`) passes, and is removed if it hasn't
// been read for more than `lifeTime`.
//
// Sometimes refreshing might take longer than the refresh period (for example
// when VPA is starting in a big cluster and tries to fetch all controllers).
// To handle such situations `lifeTime` should be longer than `validityTime`,
// so the main VPA loop can do its work quickly, using the cached information
// (instead of getting stuck on refreshing the cache).
// TODO(jbartosik): Add a way to detect when we don't refresh cache frequently
// enough.
// TODO(jbartosik): Add a way to learn how long we keep entries around so we can
// decide if / how we want to optimize entry refreshes.
type controllerCacheStorage struct {
	cache        map[scaleCacheKey]scaleCacheEntry
	mux          sync.Mutex
	validityTime time.Duration
	jitterFactor float64
	lifeTime     time.Duration
}

// Returns a bool indicating whether the entry was present in the cache,
// together with the cached response and error. Reading an entry extends its
// deleteAfter time, so entries that are in use stay cached.
func (cc *controllerCacheStorage) Get(namespace string, groupResource schema.GroupResource, name string) (ok bool, controller *autoscalingapi.Scale, err error) {
	key := scaleCacheKey{namespace: namespace, groupResource: groupResource, name: name}
	cc.mux.Lock()
	defer cc.mux.Unlock()
	r, ok := cc.cache[key]
	if ok {
		r.deleteAfter = now().Add(cc.lifeTime)
		cc.cache[key] = r
	}
	return ok, r.resource, r.err
}

// If the key is in the cache, Refresh updates the cached value, error and
// refresh time (but not the removal time).
// If the key is missing from the cache, it does nothing (relevant when we're
// concurrently updating the cache and removing stale entries from it, to
// avoid adding back an entry which we just removed).
func (cc *controllerCacheStorage) Refresh(namespace string, groupResource schema.GroupResource, name string, controller *autoscalingapi.Scale, err error) {
	key := scaleCacheKey{namespace: namespace, groupResource: groupResource, name: name}
	cc.mux.Lock()
	defer cc.mux.Unlock()
	old, ok := cc.cache[key]
	if !ok {
		return
	}
	// We refresh entries that are waiting to be removed, so when we refresh an
	// entry we mustn't change its deleteAfter time (otherwise we'd risk never
	// removing entries that are not being read).
	cc.cache[key] = scaleCacheEntry{
		refreshAfter: now().Add(wait.Jitter(cc.validityTime, cc.jitterFactor)),
		deleteAfter:  old.deleteAfter,
		resource:     controller,
		err:          err,
	}
}

// If the key is missing from the cache, Insert adds a new entry with the
// given value and error, setting both its refresh and removal times.
// If the key is already in the cache, it does nothing (to make sure that
// re-inserting an element doesn't extend its deleteAfter time).
func (cc *controllerCacheStorage) Insert(namespace string, groupResource schema.GroupResource, name string, controller *autoscalingapi.Scale, err error) {
	key := scaleCacheKey{namespace: namespace, groupResource: groupResource, name: name}
	cc.mux.Lock()
	defer cc.mux.Unlock()
	if _, ok := cc.cache[key]; ok {
		return
	}
	now := now()
	cc.cache[key] = scaleCacheEntry{
		refreshAfter: now.Add(wait.Jitter(cc.validityTime, cc.jitterFactor)),
		deleteAfter:  now.Add(cc.lifeTime),
		resource:     controller,
		err:          err,
	}
}

// Removes entries that haven't been read for more than `lifeTime` from the
// cache.
func (cc *controllerCacheStorage) RemoveExpired() {
	klog.V(1).Info("Removing entries from controllerCacheStorage")
	cc.mux.Lock()
	defer cc.mux.Unlock()
	now := now()
	removed := 0
	for k, v := range cc.cache {
		if now.After(v.deleteAfter) {
			removed++
			delete(cc.cache, k)
		}
	}
	klog.V(1).Infof("Removed %d entries from controllerCacheStorage", removed)
}

// Returns a list of keys for which values need to be refreshed.
func (cc *controllerCacheStorage) GetKeysToRefresh() []scaleCacheKey {
	result := make([]scaleCacheKey, 0)
	cc.mux.Lock()
	defer cc.mux.Unlock()
	now := now()
	for k, v := range cc.cache {
		if now.After(v.refreshAfter) {
			result = append(result, k)
		}
	}
	return result
}

func newControllerCacheStorage(validityTime, lifeTime time.Duration, jitterFactor float64) controllerCacheStorage {
	return controllerCacheStorage{
		cache:        make(map[scaleCacheKey]scaleCacheEntry),
		validityTime: validityTime,
		jitterFactor: jitterFactor,
		lifeTime:     lifeTime,
	}
}
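Taken together, these pieces implement the usage pattern described in the controllerCacheStorage doc comment. Below is a minimal sketch (not part of this commit) of that wiring; fetchScale is a hypothetical stand-in for the real scale-subresource query that the controller fetcher performs, and in the commit the loops are started by the fetcher's Start method with scaleCacheLoopPeriod.

	package controllerfetcher

	import (
		"time"

		autoscalingapi "k8s.io/api/autoscaling/v1"
		"k8s.io/apimachinery/pkg/runtime/schema"
		"k8s.io/apimachinery/pkg/util/wait"
	)

	// fetchScale is a hypothetical stand-in for querying the API server for a
	// controller's scale subresource.
	func fetchScale(key scaleCacheKey) (*autoscalingapi.Scale, error) {
		return nil, nil // placeholder
	}

	// getScaleCached serves from the cache when possible; on a miss it queries
	// the API server and caches the result (error included).
	func getScaleCached(cc *controllerCacheStorage, namespace, name string, gr schema.GroupResource) (*autoscalingapi.Scale, error) {
		if ok, scale, err := cc.Get(namespace, gr, name); ok {
			return scale, err // cache hit; this read also extended the entry's lifetime
		}
		scale, err := fetchScale(scaleCacheKey{namespace: namespace, groupResource: gr, name: name})
		cc.Insert(namespace, gr, name, scale, err)
		return scale, err
	}

	// startCacheLoops starts the two goroutines the doc comment calls for.
	func startCacheLoops(cc *controllerCacheStorage, loopPeriod time.Duration, stopCh <-chan struct{}) {
		// Goroutine 1: re-query entries whose refreshAfter deadline has passed.
		go wait.Until(func() {
			for _, key := range cc.GetKeysToRefresh() {
				scale, err := fetchScale(key)
				cc.Refresh(key.namespace, key.groupResource, key.name, scale, err)
			}
		}, loopPeriod, stopCh)
		// Goroutine 2: drop entries nobody has read for lifeTime.
		go wait.Until(cc.RemoveExpired, loopPeriod, stopCh)
	}

In the commit itself, the cluster feeder constructs the fetcher with scaleCacheEntryFreshnessTime, scaleCacheEntryLifetime and scaleCacheEntryJitterFactor, then calls Start(context.TODO(), scaleCacheLoopPeriod), which performs this wiring.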
