diff --git a/CHANGELOG.md b/CHANGELOG.md index d6a42fbd926..0468a816ed2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,8 +3,23 @@ ## Grafana Mimir - main / unreleased * [CHANGE] Compactor: No longer upload debug meta files to object storage. #1257 +* [CHANGE] Default values have changed for the following settings: #1547 + - `-alertmanager.alertmanager-client.grpc-max-recv-msg-size` now defaults to 100 MiB (previously was not configurable and set to 16 MiB) + - `-alertmanager.alertmanager-client.grpc-max-send-msg-size` now defaults to 100 MiB (previously was not configurable and set to 4 MiB) + - `-alertmanager.max-recv-msg-size` now defaults to 100 MiB (previously was 16 MiB) * [FEATURE] Ruler: Allow setting `evaluation_delay` for each rule group via rules group configuration file. #1474 * [FEATURE] Distributor: Added the ability to forward specifics metrics to alternative remote_write API endpoints. #1052 +* [ENHANCEMENT] Alertmanager API: Concurrency limit for GET requests is now configurable using `-alertmanager.max-concurrent-get-requests-per-tenant`. #1547 +* [ENHANCEMENT] Alertmanager: Added the ability to configure additional gRPC client settings for the Alertmanager distributor: #1547 + - `-alertmanager.alertmanager-client.backoff-max-period` + - `-alertmanager.alertmanager-client.backoff-min-period` + - `-alertmanager.alertmanager-client.backoff-on-ratelimits` + - `-alertmanager.alertmanager-client.backoff-retries` + - `-alertmanager.alertmanager-client.grpc-client-rate-limit` + - `-alertmanager.alertmanager-client.grpc-client-rate-limit-burst` + - `-alertmanager.alertmanager-client.grpc-compression` + - `-alertmanager.alertmanager-client.grpc-max-recv-msg-size` + - `-alertmanager.alertmanager-client.grpc-max-send-msg-size` * [ENHANCEMENT] Ruler: Add more detailed query information to ruler query stats logging. #1411 * [ENHANCEMENT] Admin: Admin API now has some styling. 
#1482 * [BUGFIX] Query-frontend: do not shard queries with a subquery unless the subquery is inside a shardable aggregation function call. #1542 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index c750e97ff2e..3a1f53cd73b 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -7953,7 +7953,7 @@ "required": false, "desc": "Maximum size (bytes) of an accepted HTTP request body.", "fieldValue": null, - "fieldDefaultValue": 16777216, + "fieldDefaultValue": 104857600, "fieldFlag": "alertmanager.max-recv-msg-size", "fieldType": "int", "fieldCategory": "advanced" @@ -8387,6 +8387,17 @@ "fieldType": "boolean", "fieldCategory": "advanced" }, + { + "kind": "field", + "name": "max_concurrent_get_requests_per_tenant", + "required": false, + "desc": "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "alertmanager.max-concurrent-get-requests-per-tenant", + "fieldType": "int", + "fieldCategory": "advanced" + }, { "kind": "block", "name": "alertmanager_client", @@ -8404,6 +8415,115 @@ "fieldType": "duration", "fieldCategory": "advanced" }, + { + "kind": "field", + "name": "max_recv_msg_size", + "required": false, + "desc": "gRPC client max receive message size (bytes).", + "fieldValue": null, + "fieldDefaultValue": 104857600, + "fieldFlag": "alertmanager.alertmanager-client.grpc-max-recv-msg-size", + "fieldType": "int", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "max_send_msg_size", + "required": false, + "desc": "gRPC client max send message size (bytes).", + "fieldValue": null, + "fieldDefaultValue": 104857600, + "fieldFlag": "alertmanager.alertmanager-client.grpc-max-send-msg-size", + "fieldType": "int", + "fieldCategory": "advanced" 
+ }, + { + "kind": "field", + "name": "grpc_compression", + "required": false, + "desc": "Use compression when sending messages. Supported values are: 'gzip', 'snappy' and '' (disable compression)", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "alertmanager.alertmanager-client.grpc-compression", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "rate_limit", + "required": false, + "desc": "Rate limit for gRPC client; 0 means disabled.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "alertmanager.alertmanager-client.grpc-client-rate-limit", + "fieldType": "float", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "rate_limit_burst", + "required": false, + "desc": "Rate limit burst for gRPC client.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "alertmanager.alertmanager-client.grpc-client-rate-limit-burst", + "fieldType": "int", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "backoff_on_ratelimits", + "required": false, + "desc": "Enable backoff and retry when we hit ratelimits.", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldFlag": "alertmanager.alertmanager-client.backoff-on-ratelimits", + "fieldType": "boolean", + "fieldCategory": "advanced" + }, + { + "kind": "block", + "name": "backoff_config", + "required": false, + "desc": "", + "blockEntries": [ + { + "kind": "field", + "name": "min_period", + "required": false, + "desc": "Minimum delay when backing off.", + "fieldValue": null, + "fieldDefaultValue": 100000000, + "fieldFlag": "alertmanager.alertmanager-client.backoff-min-period", + "fieldType": "duration", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "max_period", + "required": false, + "desc": "Maximum delay when backing off.", + "fieldValue": null, + "fieldDefaultValue": 10000000000, + "fieldFlag": "alertmanager.alertmanager-client.backoff-max-period", + "fieldType": "duration", + 
"fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "max_retries", + "required": false, + "desc": "Number of times to backoff and retry before failing.", + "fieldValue": null, + "fieldDefaultValue": 10, + "fieldFlag": "alertmanager.alertmanager-client.backoff-retries", + "fieldType": "int", + "fieldCategory": "advanced" + } + ], + "fieldValue": null, + "fieldDefaultValue": null + }, { "kind": "field", "name": "tls_enabled", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index ef2235be740..02b0cd0f091 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -99,6 +99,24 @@ Usage of ./cmd/mimir/mimir: OpenStack Swift user ID. -alertmanager-storage.swift.username string OpenStack Swift username. + -alertmanager.alertmanager-client.backoff-max-period duration + Maximum delay when backing off. (default 10s) + -alertmanager.alertmanager-client.backoff-min-period duration + Minimum delay when backing off. (default 100ms) + -alertmanager.alertmanager-client.backoff-on-ratelimits + Enable backoff and retry when we hit ratelimits. + -alertmanager.alertmanager-client.backoff-retries int + Number of times to backoff and retry before failing. (default 10) + -alertmanager.alertmanager-client.grpc-client-rate-limit float + Rate limit for gRPC client; 0 means disabled. + -alertmanager.alertmanager-client.grpc-client-rate-limit-burst int + Rate limit burst for gRPC client. + -alertmanager.alertmanager-client.grpc-compression string + Use compression when sending messages. Supported values are: 'gzip', 'snappy' and '' (disable compression) + -alertmanager.alertmanager-client.grpc-max-recv-msg-size int + gRPC client max receive message size (bytes). (default 104857600) + -alertmanager.alertmanager-client.grpc-max-send-msg-size int + gRPC client max send message size (bytes). (default 104857600) -alertmanager.alertmanager-client.remote-timeout duration Timeout for downstream alertmanagers. 
(default 2s) -alertmanager.alertmanager-client.tls-ca-path string @@ -123,12 +141,14 @@ Usage of ./cmd/mimir/mimir: Maximum number of alerts that a single tenant can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit. -alertmanager.max-alerts-size-bytes int Maximum total size of alerts that a single tenant can have, alert size is the sum of the bytes of its labels, annotations and generatorURL. Inserting more alerts will fail with a log message and metric increment. 0 = no limit. + -alertmanager.max-concurrent-get-requests-per-tenant int + Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit. -alertmanager.max-config-size-bytes int Maximum size of configuration file for Alertmanager that tenant can upload via Alertmanager API. 0 = no limit. -alertmanager.max-dispatcher-aggregation-groups int Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit. -alertmanager.max-recv-msg-size int - Maximum size (bytes) of an accepted HTTP request body. (default 16777216) + Maximum size (bytes) of an accepted HTTP request body. (default 104857600) -alertmanager.max-template-size-bytes int Maximum size of single template in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit. 
-alertmanager.max-templates-count int diff --git a/docs/sources/operators-guide/configuring/reference-configuration-parameters/index.md b/docs/sources/operators-guide/configuring/reference-configuration-parameters/index.md index 05be37f137d..7875315585f 100644 --- a/docs/sources/operators-guide/configuring/reference-configuration-parameters/index.md +++ b/docs/sources/operators-guide/configuring/reference-configuration-parameters/index.md @@ -1654,7 +1654,7 @@ The `alertmanager` block configures the alertmanager. # (advanced) Maximum size (bytes) of an accepted HTTP request body. # CLI flag: -alertmanager.max-recv-msg-size -[max_recv_msg_size: | default = 16777216] +[max_recv_msg_size: | default = 104857600] sharding_ring: # The key-value store used to share the hash ring across multiple instances. @@ -1748,11 +1748,56 @@ sharding_ring: # CLI flag: -alertmanager.enable-api [enable_api: | default = true] +# (advanced) Maximum number of concurrent GET requests allowed per tenant. The +# zero value (and negative values) result in a limit of GOMAXPROCS or 8, +# whichever is larger. Status code 503 is served for GET requests that would +# exceed the concurrency limit. +# CLI flag: -alertmanager.max-concurrent-get-requests-per-tenant +[max_concurrent_get_requests_per_tenant: | default = 0] + alertmanager_client: # (advanced) Timeout for downstream alertmanagers. # CLI flag: -alertmanager.alertmanager-client.remote-timeout [remote_timeout: | default = 2s] + # (advanced) gRPC client max receive message size (bytes). + # CLI flag: -alertmanager.alertmanager-client.grpc-max-recv-msg-size + [max_recv_msg_size: | default = 104857600] + + # (advanced) gRPC client max send message size (bytes). + # CLI flag: -alertmanager.alertmanager-client.grpc-max-send-msg-size + [max_send_msg_size: | default = 104857600] + + # (advanced) Use compression when sending messages. 
Supported values are: + # 'gzip', 'snappy' and '' (disable compression) + # CLI flag: -alertmanager.alertmanager-client.grpc-compression + [grpc_compression: | default = ""] + + # (advanced) Rate limit for gRPC client; 0 means disabled. + # CLI flag: -alertmanager.alertmanager-client.grpc-client-rate-limit + [rate_limit: | default = 0] + + # (advanced) Rate limit burst for gRPC client. + # CLI flag: -alertmanager.alertmanager-client.grpc-client-rate-limit-burst + [rate_limit_burst: | default = 0] + + # (advanced) Enable backoff and retry when we hit ratelimits. + # CLI flag: -alertmanager.alertmanager-client.backoff-on-ratelimits + [backoff_on_ratelimits: | default = false] + + backoff_config: + # (advanced) Minimum delay when backing off. + # CLI flag: -alertmanager.alertmanager-client.backoff-min-period + [min_period: | default = 100ms] + + # (advanced) Maximum delay when backing off. + # CLI flag: -alertmanager.alertmanager-client.backoff-max-period + [max_period: | default = 10s] + + # (advanced) Number of times to backoff and retry before failing. + # CLI flag: -alertmanager.alertmanager-client.backoff-retries + [max_retries: | default = 10] + # (advanced) Enable TLS in the GRPC client. This flag needs to be enabled when # any other TLS flag is set. If set to false, insecure connection to gRPC # server will be used. diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 5f201dcba0f..17503e119f9 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -68,12 +68,13 @@ const ( // Config configures an Alertmanager. 
type Config struct { - UserID string - Logger log.Logger - PeerTimeout time.Duration - Retention time.Duration - ExternalURL *url.URL - Limits Limits + UserID string + Logger log.Logger + PeerTimeout time.Duration + Retention time.Duration + MaxConcurrentGetRequestsPerTenant int + ExternalURL *url.URL + Limits Limits // Tenant-specific local directory where AM can store its state (notifications, silences, templates). When AM is stopped, entire dir is removed. TenantDataDir string @@ -244,9 +245,10 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { } am.api, err = api.New(api.Options{ - Alerts: am.alerts, - Silences: am.silences, - StatusFunc: am.marker.Status, + Alerts: am.alerts, + Silences: am.silences, + StatusFunc: am.marker.Status, + Concurrency: cfg.MaxConcurrentGetRequestsPerTenant, // Mimir should not expose cluster information back to its tenants. Peer: &NilPeer{}, Registry: am.registry, diff --git a/pkg/alertmanager/alertmanager_client.go b/pkg/alertmanager/alertmanager_client.go index 28a6d394137..bb69c69ecad 100644 --- a/pkg/alertmanager/alertmanager_client.go +++ b/pkg/alertmanager/alertmanager_client.go @@ -10,7 +10,6 @@ import ( "time" "github.com/go-kit/log" - "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/grpcclient" "github.com/grafana/dskit/ring/client" "github.com/pkg/errors" @@ -39,16 +38,14 @@ type Client interface { // ClientConfig is the configuration struct for the alertmanager client. type ClientConfig struct { - RemoteTimeout time.Duration `yaml:"remote_timeout" category:"advanced"` - TLSEnabled bool `yaml:"tls_enabled" category:"advanced"` - TLS tls.ClientConfig `yaml:",inline"` + RemoteTimeout time.Duration `yaml:"remote_timeout" category:"advanced"` + GRPCClientConfig grpcclient.Config `yaml:",inline"` } // RegisterFlagsWithPrefix registers flags with prefix. 
func (cfg *ClientConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { - f.BoolVar(&cfg.TLSEnabled, prefix+".tls-enabled", cfg.TLSEnabled, "Enable TLS in the GRPC client. This flag needs to be enabled when any other TLS flag is set. If set to false, insecure connection to gRPC server will be used.") + cfg.GRPCClientConfig.RegisterFlagsWithPrefix(prefix, f) f.DurationVar(&cfg.RemoteTimeout, prefix+".remote-timeout", 2*time.Second, "Timeout for downstream alertmanagers.") - cfg.TLS.RegisterFlagsWithPrefix(prefix, f) } // alertmanagerClientsPool is a pool of alertmanager clients. @@ -58,18 +55,6 @@ type alertmanagerClientsPool struct { // newAlertmanagerClientsPool creates a new pool of alertmanager clients. func newAlertmanagerClientsPool(discovery client.PoolServiceDiscovery, amClientCfg ClientConfig, logger log.Logger, reg prometheus.Registerer) ClientsPool { - // We prefer sensible defaults instead of exposing further config options. - grpcCfg := grpcclient.Config{ - MaxRecvMsgSize: 16 * 1024 * 1024, // 16MiB. - MaxSendMsgSize: 4 * 1024 * 1024, // 4MiB. - GRPCCompression: "", // No compression. - RateLimit: 0, // No rate limit. - RateLimitBurst: 0, // No burst of rate limit. - BackoffOnRatelimits: false, // No backoffs for rate limiting. 
- TLSEnabled: amClientCfg.TLSEnabled, - TLS: amClientCfg.TLS, - } - requestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ Name: "cortex_alertmanager_distributor_client_request_duration_seconds", Help: "Time spent executing requests from an alertmanager to another alertmanager.", @@ -86,7 +71,7 @@ func newAlertmanagerClientsPool(discovery client.PoolServiceDiscovery, amClientC }, []string{"operation", "status_code"}) factory := func(addr string) (client.PoolClient, error) { - return dialAlertmanagerClient(grpcCfg, addr, requestDuration) + return dialAlertmanagerClient(amClientCfg.GRPCClientConfig, addr, requestDuration) } poolCfg := client.PoolConfig{ diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 83bc5e9c3e5..b062847b987 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -79,6 +79,8 @@ type MultitenantAlertmanagerConfig struct { EnableAPI bool `yaml:"enable_api" category:"advanced"` + MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"` + // For distributor. AlertmanagerClient ClientConfig `yaml:"alertmanager_client"` @@ -94,7 +96,7 @@ const ( func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.StringVar(&cfg.DataDir, "alertmanager.storage.path", "./data-alertmanager/", "Directory to store Alertmanager state and temporarily configuration files. 
The content of this directory is not required to be persisted between restarts unless Alertmanager replication has been disabled.") f.DurationVar(&cfg.Retention, "alertmanager.storage.retention", 5*24*time.Hour, "How long to keep data for.") - f.Int64Var(&cfg.MaxRecvMsgSize, "alertmanager.max-recv-msg-size", 16<<20, "Maximum size (bytes) of an accepted HTTP request body.") + f.Int64Var(&cfg.MaxRecvMsgSize, "alertmanager.max-recv-msg-size", 100<<20, "Maximum size (bytes) of an accepted HTTP request body.") _ = cfg.ExternalURL.Set("http://localhost:8080/alertmanager") // set the default f.Var(&cfg.ExternalURL, "alertmanager.web.external-url", "The URL under which Alertmanager is externally reachable (eg. could be different than -http.alertmanager-http-prefix in case Alertmanager is served via a reverse proxy). This setting is used both to configure the internal requests router and to generate links in alert templates. If the external URL has a path portion, it will be used to prefix all HTTP endpoints served by Alertmanager, both the UI and API.") @@ -103,6 +105,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger f.DurationVar(&cfg.PollInterval, "alertmanager.configs.poll-interval", 15*time.Second, "How frequently to poll Alertmanager configs.") f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.") + f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. 
Status code 503 is served for GET requests that would exceed the concurrency limit.") cfg.AlertmanagerClient.RegisterFlagsWithPrefix("alertmanager.alertmanager-client", f) cfg.Persister.RegisterFlagsWithPrefix("alertmanager", f) @@ -842,17 +845,18 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco } newAM, err := New(&Config{ - UserID: userID, - TenantDataDir: tenantDir, - Logger: am.logger, - PeerTimeout: am.cfg.PeerTimeout, - Retention: am.cfg.Retention, - ExternalURL: am.cfg.ExternalURL.URL, - Replicator: am, - ReplicationFactor: am.cfg.ShardingRing.ReplicationFactor, - Store: am.store, - PersisterConfig: am.cfg.Persister, - Limits: am.limits, + UserID: userID, + TenantDataDir: tenantDir, + Logger: am.logger, + PeerTimeout: am.cfg.PeerTimeout, + Retention: am.cfg.Retention, + MaxConcurrentGetRequestsPerTenant: am.cfg.MaxConcurrentGetRequestsPerTenant, + ExternalURL: am.cfg.ExternalURL.URL, + Replicator: am, + ReplicationFactor: am.cfg.ShardingRing.ReplicationFactor, + Store: am.store, + PersisterConfig: am.cfg.Persister, + Limits: am.limits, }, reg) if err != nil { return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err)