From a7e9f694f4b9ce626c815735aaf3766b48f9519d Mon Sep 17 00:00:00 2001 From: Alexandre Gaudreault Date: Sun, 21 Jan 2024 01:22:32 -0500 Subject: [PATCH] feat(controller): add sync jitter(#14241) (#16820) * feat(controller): add sync jitter Signed-off-by: Alexandre Gaudreault * convert to duration for simplicity Signed-off-by: Alexandre Gaudreault * docs Signed-off-by: Alexandre Gaudreault * add config to manifests Signed-off-by: Alexandre Gaudreault * fix tests Signed-off-by: Alexandre Gaudreault --------- Signed-off-by: Alexandre Gaudreault --- .../commands/argocd_application_controller.go | 3 ++ controller/appcontroller.go | 26 +++++++++++--- controller/appcontroller_test.go | 2 ++ docs/faq.md | 14 ++++---- docs/operator-manual/high_availability.md | 35 ++++++++++++------- .../argocd-application-controller.md | 1 + ...ocd-application-controller-deployment.yaml | 6 ++++ ...cd-application-controller-statefulset.yaml | 6 ++++ manifests/core-install.yaml | 6 ++++ manifests/ha/install.yaml | 6 ++++ manifests/ha/namespace-install.yaml | 6 ++++ manifests/install.yaml | 6 ++++ manifests/namespace-install.yaml | 6 ++++ 13 files changed, 99 insertions(+), 24 deletions(-) diff --git a/cmd/argocd-application-controller/commands/argocd_application_controller.go b/cmd/argocd-application-controller/commands/argocd_application_controller.go index 135bcab3a72988..8004340250611e 100644 --- a/cmd/argocd-application-controller/commands/argocd_application_controller.go +++ b/cmd/argocd-application-controller/commands/argocd_application_controller.go @@ -50,6 +50,7 @@ func NewCommand() *cobra.Command { clientConfig clientcmd.ClientConfig appResyncPeriod int64 appHardResyncPeriod int64 + appResyncJitter int64 repoErrorGracePeriod int64 repoServerAddress string repoServerTimeoutSeconds int @@ -157,6 +158,7 @@ func NewCommand() *cobra.Command { kubectl, resyncDuration, hardResyncDuration, + time.Duration(appResyncJitter)*time.Second, time.Duration(selfHealTimeoutSeconds)*time.Second, time.Duration(repoErrorGracePeriod)*time.Second, metricsPort, @@ -194,6 +196,7 @@ func NewCommand() *cobra.Command { clientConfig = cli.AddKubectlFlagsToCmd(&command) command.Flags().Int64Var(&appResyncPeriod, "app-resync", int64(env.ParseDurationFromEnv("ARGOCD_RECONCILIATION_TIMEOUT", defaultAppResyncPeriod*time.Second, 0, math.MaxInt64).Seconds()), "Time period in seconds for application resync.") command.Flags().Int64Var(&appHardResyncPeriod, "app-hard-resync", int64(env.ParseDurationFromEnv("ARGOCD_HARD_RECONCILIATION_TIMEOUT", defaultAppHardResyncPeriod*time.Second, 0, math.MaxInt64).Seconds()), "Time period in seconds for application hard resync.") + command.Flags().Int64Var(&appResyncJitter, "app-resync-jitter", int64(env.ParseDurationFromEnv("ARGOCD_RECONCILIATION_JITTER", 0*time.Second, 0, math.MaxInt64).Seconds()), "Maximum time period in seconds to add as a delay jitter for application resync.") command.Flags().Int64Var(&repoErrorGracePeriod, "repo-error-grace-period-seconds", int64(env.ParseDurationFromEnv("ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS", defaultAppResyncPeriod*time.Second, 0, math.MaxInt64).Seconds()), "Grace period in seconds for ignoring consecutive errors while communicating with repo server.") command.Flags().StringVar(&repoServerAddress, "repo-server", env.StringFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER", common.DefaultRepoServerAddr), "Repo server address.") command.Flags().IntVar(&repoServerTimeoutSeconds, "repo-server-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_TIMEOUT_SECONDS", 60, 0, math.MaxInt64), "Repo server RPC call timeout seconds.") diff --git a/controller/appcontroller.go b/controller/appcontroller.go index bb4ccb1b52f2ab..8d9e595fecf4eb 100644 --- a/controller/appcontroller.go +++ b/controller/appcontroller.go @@ -6,6 +6,7 @@ import ( goerrors "errors" "fmt" "math" + "math/rand" "net/http" "reflect" "runtime/debug" @@ -118,6 +119,7 @@ type ApplicationController struct { stateCache statecache.LiveStateCache statusRefreshTimeout time.Duration statusHardRefreshTimeout time.Duration + statusRefreshJitter time.Duration selfHealTimeout time.Duration repoClientset apiclient.Clientset db db.ArgoDB @@ -142,6 +144,7 @@ func NewApplicationController( kubectl kube.Kubectl, appResyncPeriod time.Duration, appHardResyncPeriod time.Duration, + appResyncJitter time.Duration, selfHealTimeout time.Duration, repoErrorGracePeriod time.Duration, metricsPort int, @@ -154,7 +157,7 @@ func NewApplicationController( rateLimiterConfig *ratelimiter.AppControllerRateLimiterConfig, serverSideDiff bool, ) (*ApplicationController, error) { - log.Infof("appResyncPeriod=%v, appHardResyncPeriod=%v", appResyncPeriod, appHardResyncPeriod) + log.Infof("appResyncPeriod=%v, appHardResyncPeriod=%v, appResyncJitter=%v", appResyncPeriod, appHardResyncPeriod, appResyncJitter) db := db.NewDB(namespace, settingsMgr, kubeClientset) if rateLimiterConfig == nil { rateLimiterConfig = ratelimiter.GetDefaultAppRateLimiterConfig() @@ -174,6 +177,7 @@ func NewApplicationController( db: db, statusRefreshTimeout: appResyncPeriod, statusHardRefreshTimeout: appHardResyncPeriod, + statusRefreshJitter: appResyncJitter, refreshRequestedApps: make(map[string]CompareWith), refreshRequestedAppsMutex: &sync.Mutex{}, auditLogger: argo.NewAuditLogger(namespace, kubeClientset, common.ApplicationController), @@ -1646,6 +1650,7 @@ func (ctrl *ApplicationController) needRefreshAppStatus(app *appv1.Application, var reason string compareWith := CompareWithLatest refreshType := appv1.RefreshTypeNormal + softExpired := app.Status.ReconciledAt == nil || app.Status.ReconciledAt.Add(statusRefreshTimeout).Before(time.Now().UTC()) hardExpired := (app.Status.ReconciledAt == nil || app.Status.ReconciledAt.Add(statusHardRefreshTimeout).Before(time.Now().UTC())) && statusHardRefreshTimeout.Seconds() != 0 @@ -2098,14 +2103,25 @@ func (ctrl *ApplicationController) newApplicationInformerAndLister() (cache.Shar if err != nil { return } + var compareWith *CompareWith + var delay *time.Duration + oldApp, oldOK := old.(*appv1.Application) newApp, newOK := new.(*appv1.Application) - if oldOK && newOK && automatedSyncEnabled(oldApp, newApp) { - log.WithField("application", newApp.QualifiedName()).Info("Enabled automated sync") - compareWith = CompareWithLatest.Pointer() + if oldOK && newOK { + if automatedSyncEnabled(oldApp, newApp) { + log.WithField("application", newApp.QualifiedName()).Info("Enabled automated sync") + compareWith = CompareWithLatest.Pointer() + } + if ctrl.statusRefreshJitter != 0 && oldApp.ResourceVersion == newApp.ResourceVersion { + // Handler is refreshing the apps, add a random jitter to spread the load and avoid spikes + jitter := time.Duration(float64(ctrl.statusRefreshJitter) * rand.Float64()) + delay = &jitter + } } - ctrl.requestAppRefresh(newApp.QualifiedName(), compareWith, nil) + + ctrl.requestAppRefresh(newApp.QualifiedName(), compareWith, delay) ctrl.appOperationQueue.AddRateLimited(key) }, DeleteFunc: func(obj interface{}) { diff --git a/controller/appcontroller_test.go b/controller/appcontroller_test.go index 131c1deab99b00..4162a9983e9419 100644 --- a/controller/appcontroller_test.go +++ b/controller/appcontroller_test.go @@ -144,6 +144,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController { kubectl, time.Minute, time.Hour, + time.Second, time.Minute, time.Second*10, common.DefaultPortArgoCDMetrics, @@ -154,6 +155,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController { nil, data.applicationNamespaces, nil, + false, ) db := &dbmocks.ArgoDB{} diff --git a/docs/faq.md b/docs/faq.md index 95205ae6bbb4ec..83bdf8d7d38b5c 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -36,7 +36,7 @@ which might cause health check to return `Progressing` state instead of `Healthy As workaround Argo CD allows providing [health check](operator-manual/health.md) customization which overrides default behavior. -If you are using Traefik for your Ingress, you can update the Traefik config to publish the loadBalancer IP using [publishedservice](https://doc.traefik.io/traefik/providers/kubernetes-ingress/#publishedservice), which will resolve this issue. +If you are using Traefik for your Ingress, you can update the Traefik config to publish the loadBalancer IP using [publishedservice](https://doc.traefik.io/traefik/providers/kubernetes-ingress/#publishedservice), which will resolve this issue. ```yaml providers: @@ -97,7 +97,7 @@ data: ## After deploying my Helm application with Argo CD I cannot see it with `helm ls` and other Helm commands -When deploying a Helm application Argo CD is using Helm +When deploying a Helm application Argo CD is using Helm only as a template mechanism. It runs `helm template` and then deploys the resulting manifests on the cluster instead of doing `helm install`. This means that you cannot use any Helm command to view/verify the application. It is fully managed by Argo CD. @@ -140,15 +140,15 @@ Argo CD automatically sets the `app.kubernetes.io/instance` label and uses it to If the tool does this too, this causes confusion. You can change this label by setting the `application.instanceLabelKey` value in the `argocd-cm`. We recommend that you use `argocd.argoproj.io/instance`. -!!! note +!!! note When you make this change your applications will become out of sync and will need re-syncing. See [#1482](https://github.com/argoproj/argo-cd/issues/1482). ## How often does Argo CD check for changes to my Git or Helm repository ? -The default polling interval is 3 minutes (180 seconds). -You can change the setting by updating the `timeout.reconciliation` value in the [argocd-cm](https://github.com/argoproj/argo-cd/blob/2d6ce088acd4fb29271ffb6f6023dbb27594d59b/docs/operator-manual/argocd-cm.yaml#L279-L282) config map. If there are any Git changes, Argo CD will only update applications with the [auto-sync setting](user-guide/auto_sync.md) enabled. If you set it to `0` then Argo CD will stop polling Git repositories automatically and you can only use alternative methods such as [webhooks](operator-manual/webhook.md) and/or manual syncs for deploying applications. +The default polling interval is 3 minutes (180 seconds) with a configurable jitter. +You can change the setting by updating the `timeout.reconciliation` value and the `timeout.reconciliation.jitter` in the [argocd-cm](https://github.com/argoproj/argo-cd/blob/2d6ce088acd4fb29271ffb6f6023dbb27594d59b/docs/operator-manual/argocd-cm.yaml#L279-L282) config map. If there are any Git changes, Argo CD will only update applications with the [auto-sync setting](user-guide/auto_sync.md) enabled. If you set it to `0` then Argo CD will stop polling Git repositories automatically and you can only use alternative methods such as [webhooks](operator-manual/webhook.md) and/or manual syncs for deploying applications. ## Why Are My Resource Limits `Out Of Sync`? @@ -250,7 +250,7 @@ There are two parts to the message: > map[name:**KEY_BC** value:150] map[name:**KEY_BC** value:500] map[name:**KEY_BD** value:250] map[name:**KEY_BD** value:500] map[name:KEY_BI value:something] - You'll want to identify the keys that are duplicated -- you can focus on the first part, as each duplicated key will appear, once for each of its value with its value in the first list. The second list is really just + You'll want to identify the keys that are duplicated -- you can focus on the first part, as each duplicated key will appear, once for each of its value with its value in the first list. The second list is really just `]` @@ -259,7 +259,7 @@ There are two parts to the message: This includes all of the keys. It's included for debugging purposes -- you don't need to pay much attention to it. It will give you a hint about the precise location in the list for the duplicated keys: > map[name:KEY_AA] map[name:KEY_AB] map[name:KEY_AC] map[name:KEY_AD] map[name:KEY_AE] map[name:KEY_AF] map[name:KEY_AG] map[name:KEY_AH] map[name:KEY_AI] map[name:KEY_AJ] map[name:KEY_AK] map[name:KEY_AL] map[name:KEY_AM] map[name:KEY_AN] map[name:KEY_AO] map[name:KEY_AP] map[name:KEY_AQ] map[name:KEY_AR] map[name:KEY_AS] map[name:KEY_AT] map[name:KEY_AU] map[name:KEY_AV] map[name:KEY_AW] map[name:KEY_AX] map[name:KEY_AY] map[name:KEY_AZ] map[name:KEY_BA] map[name:KEY_BB] map[name:**KEY_BC**] map[name:**KEY_BD**] map[name:KEY_BE] map[name:KEY_BF] map[name:KEY_BG] map[name:KEY_BH] map[name:KEY_BI] map[name:**KEY_BC**] map[name:**KEY_BD**] - + `]` In this case, the duplicated keys have been **emphasized** to help you identify the problematic keys. Many editors have the ability to highlight all instances of a string, using such an editor can help with such problems. diff --git a/docs/operator-manual/high_availability.md b/docs/operator-manual/high_availability.md index b2051ad1a152ce..0a011104967f1e 100644 --- a/docs/operator-manual/high_availability.md +++ b/docs/operator-manual/high_availability.md @@ -57,7 +57,7 @@ performance. For performance reasons the controller monitors and caches only the preferred version into a version of the resource stored in Git. If `kubectl convert` fails because the conversion is not supported then the controller falls back to Kubernetes API query which slows down reconciliation. In this case, we advise to use the preferred resource version in Git. -* The controller polls Git every 3m by default. You can change this duration using the `timeout.reconciliation` setting in the `argocd-cm` ConfigMap. The value of `timeout.reconciliation` is a duration string e.g `60s`, `1m`, `1h` or `1d`. +* The controller polls Git every 3m by default. You can change this duration using the `timeout.reconciliation` and `timeout.reconciliation.jitter` setting in the `argocd-cm` ConfigMap. The value of the fields is a duration string e.g `60s`, `1m`, `1h` or `1d`. * If the controller is managing too many clusters and uses too much memory then you can shard clusters across multiple controller replicas. To enable sharding, increase the number of replicas in `argocd-application-controller` `StatefulSet` @@ -244,30 +244,41 @@ spec: # ... ``` +### Application Sync Timeout & Jitter + +Argo CD has a timeout for application syncs. It will trigger a refresh for each application periodically when the timeout expires. +With a large number of applications, this will cause a spike in the refresh queue and can cause a spike to the repo-server component. To avoid this, you can set a jitter to the sync timeout which will spread out the refreshes and give time to the repo-server to catch up. + +The jitter is the maximum duration that can be added to the sync timeout, so if the sync timeout is 5 minutes and the jitter is 1 minute, then the actual timeout will be between 5 and 6 minutes. + +To configure the jitter you can set the following environment variables: + +* `ARGOCD_RECONCILIATION_JITTER` - The jitter to apply to the sync timeout. Disabled when value is 0. Defaults to 0. + ## Rate Limiting Application Reconciliations -To prevent high controller resource usage or sync loops caused either due to misbehaving apps or other environment specific factors, +To prevent high controller resource usage or sync loops caused either due to misbehaving apps or other environment specific factors, we can configure rate limits on the workqueues used by the application controller. There are two types of rate limits that can be configured: * Global rate limits * Per item rate limits -The final rate limiter uses a combination of both and calculates the final backoff as `max(globalBackoff, perItemBackoff)`. +The final rate limiter uses a combination of both and calculates the final backoff as `max(globalBackoff, perItemBackoff)`. ### Global rate limits This is enabled by default, it is a simple bucket based rate limiter that limits the number of items that can be queued per second. -This is useful to prevent a large number of apps from being queued at the same time. - +This is useful to prevent a large number of apps from being queued at the same time. + To configure the bucket limiter you can set the following environment variables: * `WORKQUEUE_BUCKET_SIZE` - The number of items that can be queued in a single burst. Defaults to 500. * `WORKQUEUE_BUCKET_QPS` - The number of items that can be queued per second. Defaults to 50. -### Per item rate limits +### Per item rate limits - This by default returns a fixed base delay/backoff value but can be configured to return exponential values, read further to understand it's working. -Per item rate limiter limits the number of times a particular item can be queued. This is based on exponential backoff where the backoff time for an item keeps increasing exponentially + This by default returns a fixed base delay/backoff value but can be configured to return exponential values. +Per item rate limiter limits the number of times a particular item can be queued. This is based on exponential backoff where the backoff time for an item keeps increasing exponentially if it is queued multiple times in a short period, but the backoff is reset automatically if a configured `cool down` period has elapsed since the last time the item was queued. To configure the per item limiter you can set the following environment variables: @@ -277,16 +288,16 @@ To configure the per item limiter you can set the following environment variable * `WORKQUEUE_MAX_DELAY_NS` : The max delay in nanoseconds, this is the max backoff limit. Defaults to 3 * 10^9 (=3s) * `WORKQUEUE_BACKOFF_FACTOR` : The backoff factor, this is the factor by which the backoff is increased for each retry. Defaults to 1.5 -The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been queued +The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been queued and `lastRequeueTime` is the time at which the item was last queued: - When `WORKQUEUE_FAILURE_COOLDOWN_NS` != 0 : ``` -backoff = time.Since(lastRequeueTime) >= WORKQUEUE_FAILURE_COOLDOWN_NS ? - WORKQUEUE_BASE_DELAY_NS : +backoff = time.Since(lastRequeueTime) >= WORKQUEUE_FAILURE_COOLDOWN_NS ? + WORKQUEUE_BASE_DELAY_NS : min( - WORKQUEUE_MAX_DELAY_NS, + WORKQUEUE_MAX_DELAY_NS, WORKQUEUE_BASE_DELAY_NS * WORKQUEUE_BACKOFF_FACTOR ^ (numRequeue) ) ``` diff --git a/docs/operator-manual/server-commands/argocd-application-controller.md b/docs/operator-manual/server-commands/argocd-application-controller.md index 1d71fc6494445a..f4057bf7b04cca 100644 --- a/docs/operator-manual/server-commands/argocd-application-controller.md +++ b/docs/operator-manual/server-commands/argocd-application-controller.md @@ -17,6 +17,7 @@ argocd-application-controller [flags] ``` --app-hard-resync int Time period in seconds for application hard resync. --app-resync int Time period in seconds for application resync. (default 180) + --app-resync-jitter int Maximum time period in seconds to add as a delay jitter for application resync. --app-state-cache-expiration duration Cache expiration for app state (default 1h0m0s) --application-namespaces strings List of additional namespaces that applications are allowed to be reconciled from --as string Username to impersonate for the operation diff --git a/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml b/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml index 0fbf979809c976..bcaf2d4bb5894e 100644 --- a/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml +++ b/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml @@ -34,6 +34,12 @@ spec: name: argocd-cm key: timeout.hard.reconciliation optional: true + - name: ARGOCD_RECONCILIATION_JITTER + valueFrom: + configMapKeyRef: + key: timeout.reconciliation.jitter + name: argocd-cm + optional: true - name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS valueFrom: configMapKeyRef: diff --git a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml index 62f98a14492153..d974edffdd6185 100644 --- a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml +++ b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml @@ -35,6 +35,12 @@ spec: name: argocd-cm key: timeout.hard.reconciliation optional: true + - name: ARGOCD_RECONCILIATION_JITTER + valueFrom: + configMapKeyRef: + key: timeout.reconciliation.jitter + name: argocd-cm + optional: true - name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS valueFrom: configMapKeyRef: diff --git a/manifests/core-install.yaml b/manifests/core-install.yaml index 08d7d972e63627..254cd6e22044f7 100644 --- a/manifests/core-install.yaml +++ b/manifests/core-install.yaml @@ -21514,6 +21514,12 @@ spec: key: timeout.hard.reconciliation name: argocd-cm optional: true + - name: ARGOCD_RECONCILIATION_JITTER + valueFrom: + configMapKeyRef: + key: timeout.reconciliation.jitter + name: argocd-cm + optional: true - name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS valueFrom: configMapKeyRef: diff --git a/manifests/ha/install.yaml b/manifests/ha/install.yaml index 2029be1e07e12e..e3433300508557 100644 --- a/manifests/ha/install.yaml +++ b/manifests/ha/install.yaml @@ -23453,6 +23453,12 @@ spec: key: timeout.hard.reconciliation name: argocd-cm optional: true + - name: ARGOCD_RECONCILIATION_JITTER + valueFrom: + configMapKeyRef: + key: timeout.reconciliation.jitter + name: argocd-cm + optional: true - name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS valueFrom: configMapKeyRef: diff --git a/manifests/ha/namespace-install.yaml b/manifests/ha/namespace-install.yaml index 01a8da2ffd7b9d..ccac170de7e192 100644 --- a/manifests/ha/namespace-install.yaml +++ b/manifests/ha/namespace-install.yaml @@ -2719,6 +2719,12 @@ spec: key: timeout.hard.reconciliation name: argocd-cm optional: true + - name: ARGOCD_RECONCILIATION_JITTER + valueFrom: + configMapKeyRef: + key: timeout.reconciliation.jitter + name: argocd-cm + optional: true - name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS valueFrom: configMapKeyRef: diff --git a/manifests/install.yaml b/manifests/install.yaml index 62c8b89598d46e..e65d42b5c66a01 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -22498,6 +22498,12 @@ spec: key: timeout.hard.reconciliation name: argocd-cm optional: true + - name: ARGOCD_RECONCILIATION_JITTER + valueFrom: + configMapKeyRef: + key: timeout.reconciliation.jitter + name: argocd-cm + optional: true - name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS valueFrom: configMapKeyRef: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index 76301680f195a4..ab6e3b63348fd9 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -1763,6 +1763,12 @@ spec: key: timeout.hard.reconciliation name: argocd-cm optional: true + - name: ARGOCD_RECONCILIATION_JITTER + valueFrom: + configMapKeyRef: + key: timeout.reconciliation.jitter + name: argocd-cm + optional: true - name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS valueFrom: configMapKeyRef: