Skip to content

Commit

Permalink
feat(controller): add sync jitter(argoproj#14241) (argoproj#16820)
Browse files Browse the repository at this point in the history
* feat(controller): add sync jitter

Signed-off-by: Alexandre Gaudreault <[email protected]>

* convert to duration for simplicity

Signed-off-by: Alexandre Gaudreault <[email protected]>

* docs

Signed-off-by: Alexandre Gaudreault <[email protected]>

* add config to manifests

Signed-off-by: Alexandre Gaudreault <[email protected]>

* fix tests

Signed-off-by: Alexandre Gaudreault <[email protected]>

---------

Signed-off-by: Alexandre Gaudreault <[email protected]>
  • Loading branch information
agaudreault authored and Julien Fuix committed Feb 6, 2024
1 parent 4da91e2 commit a7e9f69
Show file tree
Hide file tree
Showing 13 changed files with 99 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func NewCommand() *cobra.Command {
clientConfig clientcmd.ClientConfig
appResyncPeriod int64
appHardResyncPeriod int64
appResyncJitter int64
repoErrorGracePeriod int64
repoServerAddress string
repoServerTimeoutSeconds int
Expand Down Expand Up @@ -157,6 +158,7 @@ func NewCommand() *cobra.Command {
kubectl,
resyncDuration,
hardResyncDuration,
time.Duration(appResyncJitter)*time.Second,
time.Duration(selfHealTimeoutSeconds)*time.Second,
time.Duration(repoErrorGracePeriod)*time.Second,
metricsPort,
Expand Down Expand Up @@ -194,6 +196,7 @@ func NewCommand() *cobra.Command {
clientConfig = cli.AddKubectlFlagsToCmd(&command)
command.Flags().Int64Var(&appResyncPeriod, "app-resync", int64(env.ParseDurationFromEnv("ARGOCD_RECONCILIATION_TIMEOUT", defaultAppResyncPeriod*time.Second, 0, math.MaxInt64).Seconds()), "Time period in seconds for application resync.")
command.Flags().Int64Var(&appHardResyncPeriod, "app-hard-resync", int64(env.ParseDurationFromEnv("ARGOCD_HARD_RECONCILIATION_TIMEOUT", defaultAppHardResyncPeriod*time.Second, 0, math.MaxInt64).Seconds()), "Time period in seconds for application hard resync.")
command.Flags().Int64Var(&appResyncJitter, "app-resync-jitter", int64(env.ParseDurationFromEnv("ARGOCD_RECONCILIATION_JITTER", 0*time.Second, 0, math.MaxInt64).Seconds()), "Maximum time period in seconds to add as a delay jitter for application resync.")
command.Flags().Int64Var(&repoErrorGracePeriod, "repo-error-grace-period-seconds", int64(env.ParseDurationFromEnv("ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS", defaultAppResyncPeriod*time.Second, 0, math.MaxInt64).Seconds()), "Grace period in seconds for ignoring consecutive errors while communicating with repo server.")
command.Flags().StringVar(&repoServerAddress, "repo-server", env.StringFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER", common.DefaultRepoServerAddr), "Repo server address.")
command.Flags().IntVar(&repoServerTimeoutSeconds, "repo-server-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_TIMEOUT_SECONDS", 60, 0, math.MaxInt64), "Repo server RPC call timeout seconds.")
Expand Down
26 changes: 21 additions & 5 deletions controller/appcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
goerrors "errors"
"fmt"
"math"
"math/rand"
"net/http"
"reflect"
"runtime/debug"
Expand Down Expand Up @@ -118,6 +119,7 @@ type ApplicationController struct {
stateCache statecache.LiveStateCache
statusRefreshTimeout time.Duration
statusHardRefreshTimeout time.Duration
statusRefreshJitter time.Duration
selfHealTimeout time.Duration
repoClientset apiclient.Clientset
db db.ArgoDB
Expand All @@ -142,6 +144,7 @@ func NewApplicationController(
kubectl kube.Kubectl,
appResyncPeriod time.Duration,
appHardResyncPeriod time.Duration,
appResyncJitter time.Duration,
selfHealTimeout time.Duration,
repoErrorGracePeriod time.Duration,
metricsPort int,
Expand All @@ -154,7 +157,7 @@ func NewApplicationController(
rateLimiterConfig *ratelimiter.AppControllerRateLimiterConfig,
serverSideDiff bool,
) (*ApplicationController, error) {
log.Infof("appResyncPeriod=%v, appHardResyncPeriod=%v", appResyncPeriod, appHardResyncPeriod)
log.Infof("appResyncPeriod=%v, appHardResyncPeriod=%v, appResyncJitter=%v", appResyncPeriod, appHardResyncPeriod, appResyncJitter)
db := db.NewDB(namespace, settingsMgr, kubeClientset)
if rateLimiterConfig == nil {
rateLimiterConfig = ratelimiter.GetDefaultAppRateLimiterConfig()
Expand All @@ -174,6 +177,7 @@ func NewApplicationController(
db: db,
statusRefreshTimeout: appResyncPeriod,
statusHardRefreshTimeout: appHardResyncPeriod,
statusRefreshJitter: appResyncJitter,
refreshRequestedApps: make(map[string]CompareWith),
refreshRequestedAppsMutex: &sync.Mutex{},
auditLogger: argo.NewAuditLogger(namespace, kubeClientset, common.ApplicationController),
Expand Down Expand Up @@ -1646,6 +1650,7 @@ func (ctrl *ApplicationController) needRefreshAppStatus(app *appv1.Application,
var reason string
compareWith := CompareWithLatest
refreshType := appv1.RefreshTypeNormal

softExpired := app.Status.ReconciledAt == nil || app.Status.ReconciledAt.Add(statusRefreshTimeout).Before(time.Now().UTC())
hardExpired := (app.Status.ReconciledAt == nil || app.Status.ReconciledAt.Add(statusHardRefreshTimeout).Before(time.Now().UTC())) && statusHardRefreshTimeout.Seconds() != 0

Expand Down Expand Up @@ -2098,14 +2103,25 @@ func (ctrl *ApplicationController) newApplicationInformerAndLister() (cache.Shar
if err != nil {
return
}

var compareWith *CompareWith
var delay *time.Duration

oldApp, oldOK := old.(*appv1.Application)
newApp, newOK := new.(*appv1.Application)
if oldOK && newOK && automatedSyncEnabled(oldApp, newApp) {
log.WithField("application", newApp.QualifiedName()).Info("Enabled automated sync")
compareWith = CompareWithLatest.Pointer()
if oldOK && newOK {
if automatedSyncEnabled(oldApp, newApp) {
log.WithField("application", newApp.QualifiedName()).Info("Enabled automated sync")
compareWith = CompareWithLatest.Pointer()
}
if ctrl.statusRefreshJitter != 0 && oldApp.ResourceVersion == newApp.ResourceVersion {
// Handler is refreshing the apps, add a random jitter to spread the load and avoid spikes
jitter := time.Duration(float64(ctrl.statusRefreshJitter) * rand.Float64())
delay = &jitter
}
}
ctrl.requestAppRefresh(newApp.QualifiedName(), compareWith, nil)

ctrl.requestAppRefresh(newApp.QualifiedName(), compareWith, delay)
ctrl.appOperationQueue.AddRateLimited(key)
},
DeleteFunc: func(obj interface{}) {
Expand Down
2 changes: 2 additions & 0 deletions controller/appcontroller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController {
kubectl,
time.Minute,
time.Hour,
time.Second,
time.Minute,
time.Second*10,
common.DefaultPortArgoCDMetrics,
Expand All @@ -154,6 +155,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController {
nil,
data.applicationNamespaces,
nil,

false,
)
db := &dbmocks.ArgoDB{}
Expand Down
14 changes: 7 additions & 7 deletions docs/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ which might cause health check to return `Progressing` state instead of `Healthy
As workaround Argo CD allows providing [health check](operator-manual/health.md) customization which overrides default
behavior.

If you are using Traefik for your Ingress, you can update the Traefik config to publish the loadBalancer IP using [publishedservice](https://doc.traefik.io/traefik/providers/kubernetes-ingress/#publishedservice), which will resolve this issue.
If you are using Traefik for your Ingress, you can update the Traefik config to publish the loadBalancer IP using [publishedservice](https://doc.traefik.io/traefik/providers/kubernetes-ingress/#publishedservice), which will resolve this issue.

```yaml
providers:
Expand Down Expand Up @@ -97,7 +97,7 @@ data:

## After deploying my Helm application with Argo CD I cannot see it with `helm ls` and other Helm commands

When deploying a Helm application Argo CD is using Helm
When deploying a Helm application Argo CD is using Helm
only as a template mechanism. It runs `helm template` and
then deploys the resulting manifests on the cluster instead of doing `helm install`. This means that you cannot use any Helm command
to view/verify the application. It is fully managed by Argo CD.
Expand Down Expand Up @@ -140,15 +140,15 @@ Argo CD automatically sets the `app.kubernetes.io/instance` label and uses it to
If the tool does this too, this causes confusion. You can change this label by setting
the `application.instanceLabelKey` value in the `argocd-cm`. We recommend that you use `argocd.argoproj.io/instance`.

!!! note
!!! note
When you make this change your applications will become out of sync and will need re-syncing.

See [#1482](https://github.com/argoproj/argo-cd/issues/1482).

## How often does Argo CD check for changes to my Git or Helm repository ?

The default polling interval is 3 minutes (180 seconds).
You can change the setting by updating the `timeout.reconciliation` value in the [argocd-cm](https://github.com/argoproj/argo-cd/blob/2d6ce088acd4fb29271ffb6f6023dbb27594d59b/docs/operator-manual/argocd-cm.yaml#L279-L282) config map. If there are any Git changes, Argo CD will only update applications with the [auto-sync setting](user-guide/auto_sync.md) enabled. If you set it to `0` then Argo CD will stop polling Git repositories automatically and you can only use alternative methods such as [webhooks](operator-manual/webhook.md) and/or manual syncs for deploying applications.
The default polling interval is 3 minutes (180 seconds) with a configurable jitter.
You can change these settings by updating the `timeout.reconciliation` and `timeout.reconciliation.jitter` values in the [argocd-cm](https://github.com/argoproj/argo-cd/blob/2d6ce088acd4fb29271ffb6f6023dbb27594d59b/docs/operator-manual/argocd-cm.yaml#L279-L282) config map. If there are any Git changes, Argo CD will only update applications with the [auto-sync setting](user-guide/auto_sync.md) enabled. If you set `timeout.reconciliation` to `0`, Argo CD will stop polling Git repositories automatically and you can only use alternative methods such as [webhooks](operator-manual/webhook.md) and/or manual syncs for deploying applications.


## Why Are My Resource Limits `Out Of Sync`?
Expand Down Expand Up @@ -250,7 +250,7 @@ There are two parts to the message:

> map[name:**KEY_BC** value:150] map[name:**KEY_BC** value:500] map[name:**KEY_BD** value:250] map[name:**KEY_BD** value:500] map[name:KEY_BI value:something]

You'll want to identify the keys that are duplicated -- you can focus on the first part, as each duplicated key will appear, once for each of its value with its value in the first list. The second list is really just
You'll want to identify the keys that are duplicated -- you can focus on the first part, as each duplicated key will appear in the first list once for each of its values, together with that value. The second list is really just

`]`

Expand All @@ -259,7 +259,7 @@ There are two parts to the message:
This includes all of the keys. It's included for debugging purposes -- you don't need to pay much attention to it. It will give you a hint about the precise location in the list for the duplicated keys:

> map[name:KEY_AA] map[name:KEY_AB] map[name:KEY_AC] map[name:KEY_AD] map[name:KEY_AE] map[name:KEY_AF] map[name:KEY_AG] map[name:KEY_AH] map[name:KEY_AI] map[name:KEY_AJ] map[name:KEY_AK] map[name:KEY_AL] map[name:KEY_AM] map[name:KEY_AN] map[name:KEY_AO] map[name:KEY_AP] map[name:KEY_AQ] map[name:KEY_AR] map[name:KEY_AS] map[name:KEY_AT] map[name:KEY_AU] map[name:KEY_AV] map[name:KEY_AW] map[name:KEY_AX] map[name:KEY_AY] map[name:KEY_AZ] map[name:KEY_BA] map[name:KEY_BB] map[name:**KEY_BC**] map[name:**KEY_BD**] map[name:KEY_BE] map[name:KEY_BF] map[name:KEY_BG] map[name:KEY_BH] map[name:KEY_BI] map[name:**KEY_BC**] map[name:**KEY_BD**]

`]`

In this case, the duplicated keys have been **emphasized** to help you identify the problematic keys. Many editors have the ability to highlight all instances of a string, using such an editor can help with such problems.
Expand Down
35 changes: 23 additions & 12 deletions docs/operator-manual/high_availability.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ performance. For performance reasons the controller monitors and caches only the
preferred version into a version of the resource stored in Git. If `kubectl convert` fails because the conversion is not supported then the controller falls back to Kubernetes API query which slows down
reconciliation. In this case, we advise to use the preferred resource version in Git.

* The controller polls Git every 3m by default. You can change this duration using the `timeout.reconciliation` setting in the `argocd-cm` ConfigMap. The value of `timeout.reconciliation` is a duration string e.g `60s`, `1m`, `1h` or `1d`.
* The controller polls Git every 3m by default. You can change this duration using the `timeout.reconciliation` and `timeout.reconciliation.jitter` settings in the `argocd-cm` ConfigMap. The value of each field is a duration string, e.g. `60s`, `1m`, `1h` or `1d`.

* If the controller is managing too many clusters and uses too much memory then you can shard clusters across multiple
controller replicas. To enable sharding, increase the number of replicas in `argocd-application-controller` `StatefulSet`
Expand Down Expand Up @@ -244,30 +244,41 @@ spec:
# ...
```

### Application Sync Timeout & Jitter

Argo CD has a timeout for application syncs. It will trigger a refresh for each application periodically when the timeout expires.
With a large number of applications, this will cause a spike in the refresh queue and can cause a load spike on the repo-server component. To avoid this, you can add a jitter to the sync timeout, which will spread out the refreshes and give the repo-server time to catch up.

The jitter is the maximum duration that can be added to the sync timeout, so if the sync timeout is 5 minutes and the jitter is 1 minute, then the actual timeout will be between 5 and 6 minutes.

To configure the jitter you can set the following environment variable:

* `ARGOCD_RECONCILIATION_JITTER` - The jitter to apply to the sync timeout. Disabled when value is 0. Defaults to 0.

## Rate Limiting Application Reconciliations

To prevent high controller resource usage or sync loops caused either due to misbehaving apps or other environment specific factors,
To prevent high controller resource usage or sync loops caused either due to misbehaving apps or other environment specific factors,
we can configure rate limits on the workqueues used by the application controller. There are two types of rate limits that can be configured:

* Global rate limits
* Per item rate limits

The final rate limiter uses a combination of both and calculates the final backoff as `max(globalBackoff, perItemBackoff)`.
The final rate limiter uses a combination of both and calculates the final backoff as `max(globalBackoff, perItemBackoff)`.

### Global rate limits

This is enabled by default, it is a simple bucket based rate limiter that limits the number of items that can be queued per second.
This is useful to prevent a large number of apps from being queued at the same time.
This is useful to prevent a large number of apps from being queued at the same time.

To configure the bucket limiter you can set the following environment variables:

* `WORKQUEUE_BUCKET_SIZE` - The number of items that can be queued in a single burst. Defaults to 500.
* `WORKQUEUE_BUCKET_QPS` - The number of items that can be queued per second. Defaults to 50.

### Per item rate limits
### Per item rate limits

This by default returns a fixed base delay/backoff value but can be configured to return exponential values, read further to understand it's working.
Per item rate limiter limits the number of times a particular item can be queued. This is based on exponential backoff where the backoff time for an item keeps increasing exponentially
This by default returns a fixed base delay/backoff value but can be configured to return exponential values.
Per item rate limiter limits the number of times a particular item can be queued. This is based on exponential backoff where the backoff time for an item keeps increasing exponentially
if it is queued multiple times in a short period, but the backoff is reset automatically if a configured `cool down` period has elapsed since the last time the item was queued.

To configure the per item limiter you can set the following environment variables:
Expand All @@ -277,16 +288,16 @@ To configure the per item limiter you can set the following environment variable
* `WORKQUEUE_MAX_DELAY_NS` : The max delay in nanoseconds, this is the max backoff limit. Defaults to 3 * 10^9 (=3s)
* `WORKQUEUE_BACKOFF_FACTOR` : The backoff factor, this is the factor by which the backoff is increased for each retry. Defaults to 1.5

The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been queued
The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been queued
and `lastRequeueTime` is the time at which the item was last queued:

- When `WORKQUEUE_FAILURE_COOLDOWN_NS` != 0 :

```
backoff = time.Since(lastRequeueTime) >= WORKQUEUE_FAILURE_COOLDOWN_NS ?
WORKQUEUE_BASE_DELAY_NS :
backoff = time.Since(lastRequeueTime) >= WORKQUEUE_FAILURE_COOLDOWN_NS ?
WORKQUEUE_BASE_DELAY_NS :
min(
WORKQUEUE_MAX_DELAY_NS,
WORKQUEUE_MAX_DELAY_NS,
WORKQUEUE_BASE_DELAY_NS * WORKQUEUE_BACKOFF_FACTOR ^ (numRequeue)
)
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ argocd-application-controller [flags]
```
--app-hard-resync int Time period in seconds for application hard resync.
--app-resync int Time period in seconds for application resync. (default 180)
--app-resync-jitter int Maximum time period in seconds to add as a delay jitter for application resync.
--app-state-cache-expiration duration Cache expiration for app state (default 1h0m0s)
--application-namespaces strings List of additional namespaces that applications are allowed to be reconciled from
--as string Username to impersonate for the operation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ spec:
name: argocd-cm
key: timeout.hard.reconciliation
optional: true
- name: ARGOCD_RECONCILIATION_JITTER
valueFrom:
configMapKeyRef:
key: timeout.reconciliation.jitter
name: argocd-cm
optional: true
- name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS
valueFrom:
configMapKeyRef:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ spec:
name: argocd-cm
key: timeout.hard.reconciliation
optional: true
- name: ARGOCD_RECONCILIATION_JITTER
valueFrom:
configMapKeyRef:
key: timeout.reconciliation.jitter
name: argocd-cm
optional: true
- name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS
valueFrom:
configMapKeyRef:
Expand Down
6 changes: 6 additions & 0 deletions manifests/core-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21514,6 +21514,12 @@ spec:
key: timeout.hard.reconciliation
name: argocd-cm
optional: true
- name: ARGOCD_RECONCILIATION_JITTER
valueFrom:
configMapKeyRef:
key: timeout.reconciliation.jitter
name: argocd-cm
optional: true
- name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS
valueFrom:
configMapKeyRef:
Expand Down
6 changes: 6 additions & 0 deletions manifests/ha/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23453,6 +23453,12 @@ spec:
key: timeout.hard.reconciliation
name: argocd-cm
optional: true
- name: ARGOCD_RECONCILIATION_JITTER
valueFrom:
configMapKeyRef:
key: timeout.reconciliation.jitter
name: argocd-cm
optional: true
- name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS
valueFrom:
configMapKeyRef:
Expand Down
6 changes: 6 additions & 0 deletions manifests/ha/namespace-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2719,6 +2719,12 @@ spec:
key: timeout.hard.reconciliation
name: argocd-cm
optional: true
- name: ARGOCD_RECONCILIATION_JITTER
valueFrom:
configMapKeyRef:
key: timeout.reconciliation.jitter
name: argocd-cm
optional: true
- name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS
valueFrom:
configMapKeyRef:
Expand Down
6 changes: 6 additions & 0 deletions manifests/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22498,6 +22498,12 @@ spec:
key: timeout.hard.reconciliation
name: argocd-cm
optional: true
- name: ARGOCD_RECONCILIATION_JITTER
valueFrom:
configMapKeyRef:
key: timeout.reconciliation.jitter
name: argocd-cm
optional: true
- name: ARGOCD_REPO_ERROR_GRACE_PERIOD_SECONDS
valueFrom:
configMapKeyRef:
Expand Down
Loading

0 comments on commit a7e9f69

Please sign in to comment.