Skip to content

Commit

Permalink
Add configuration for Alertmanager GRPC client and GET request concur…
Browse files Browse the repository at this point in the history
…rency (#1547)

* Add configuration for Alertmanager GRPC client and GET request concurrency

Tuning these parameters is useful when alertmanager is under heavy load

* Update CHANGELOG

* Address PR feedback

* Update CHANGELOG.md

Co-authored-by: Marco Pracucci <[email protected]>

* Update CHANGELOG.md

Co-authored-by: Marco Pracucci <[email protected]>

* Update CHANGELOG.md

Co-authored-by: gotjosh <[email protected]>

* Address PR feedback

Co-authored-by: Marco Pracucci <[email protected]>
Co-authored-by: gotjosh <[email protected]>
  • Loading branch information
3 people authored Mar 31, 2022
1 parent 0db680b commit 180ff05
Show file tree
Hide file tree
Showing 7 changed files with 234 additions and 43 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,23 @@
## Grafana Mimir - main / unreleased

* [CHANGE] Compactor: No longer upload debug meta files to object storage. #1257
* [CHANGE] Default values have changed for the following settings: #1547
- `-alertmanager.alertmanager-client.grpc-max-recv-msg-size` now defaults to 100 MiB (previously was not configurable and set to 16 MiB)
- `-alertmanager.alertmanager-client.grpc-max-send-msg-size` now defaults to 100 MiB (previously was not configurable and set to 4 MiB)
- `-alertmanager.max-recv-msg-size` now defaults to 100 MiB (previously was 16 MiB)
* [FEATURE] Ruler: Allow setting `evaluation_delay` for each rule group via rules group configuration file. #1474
* [FEATURE] Distributor: Added the ability to forward specific metrics to alternative remote_write API endpoints. #1052
* [ENHANCEMENT] Alertmanager API: Concurrency limit for GET requests is now configurable using `-alertmanager.max-concurrent-get-requests-per-tenant`. #1547
* [ENHANCEMENT] Alertmanager: Added the ability to configure additional gRPC client settings for the Alertmanager distributor: #1547
- `-alertmanager.alertmanager-client.backoff-max-period`
- `-alertmanager.alertmanager-client.backoff-min-period`
- `-alertmanager.alertmanager-client.backoff-on-ratelimits`
- `-alertmanager.alertmanager-client.backoff-retries`
- `-alertmanager.alertmanager-client.grpc-client-rate-limit`
- `-alertmanager.alertmanager-client.grpc-client-rate-limit-burst`
- `-alertmanager.alertmanager-client.grpc-compression`
- `-alertmanager.alertmanager-client.grpc-max-recv-msg-size`
- `-alertmanager.alertmanager-client.grpc-max-send-msg-size`
* [ENHANCEMENT] Ruler: Add more detailed query information to ruler query stats logging. #1411
* [ENHANCEMENT] Admin: Admin API now has some styling. #1482 #1549
* [ENHANCEMENT] Alertmanager: added `insight=true` field to alertmanager dispatch logs. #1379
Expand Down
122 changes: 121 additions & 1 deletion cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -7953,7 +7953,7 @@
"required": false,
"desc": "Maximum size (bytes) of an accepted HTTP request body.",
"fieldValue": null,
"fieldDefaultValue": 16777216,
"fieldDefaultValue": 104857600,
"fieldFlag": "alertmanager.max-recv-msg-size",
"fieldType": "int",
"fieldCategory": "advanced"
Expand Down Expand Up @@ -8387,6 +8387,17 @@
"fieldType": "boolean",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_concurrent_get_requests_per_tenant",
"required": false,
"desc": "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "alertmanager.max-concurrent-get-requests-per-tenant",
"fieldType": "int",
"fieldCategory": "advanced"
},
{
"kind": "block",
"name": "alertmanager_client",
Expand All @@ -8404,6 +8415,115 @@
"fieldType": "duration",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_recv_msg_size",
"required": false,
"desc": "gRPC client max receive message size (bytes).",
"fieldValue": null,
"fieldDefaultValue": 104857600,
"fieldFlag": "alertmanager.alertmanager-client.grpc-max-recv-msg-size",
"fieldType": "int",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_send_msg_size",
"required": false,
"desc": "gRPC client max send message size (bytes).",
"fieldValue": null,
"fieldDefaultValue": 104857600,
"fieldFlag": "alertmanager.alertmanager-client.grpc-max-send-msg-size",
"fieldType": "int",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "grpc_compression",
"required": false,
"desc": "Use compression when sending messages. Supported values are: 'gzip', 'snappy' and '' (disable compression)",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "alertmanager.alertmanager-client.grpc-compression",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "rate_limit",
"required": false,
"desc": "Rate limit for gRPC client; 0 means disabled.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "alertmanager.alertmanager-client.grpc-client-rate-limit",
"fieldType": "float",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "rate_limit_burst",
"required": false,
"desc": "Rate limit burst for gRPC client.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "alertmanager.alertmanager-client.grpc-client-rate-limit-burst",
"fieldType": "int",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "backoff_on_ratelimits",
"required": false,
"desc": "Enable backoff and retry when we hit ratelimits.",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldFlag": "alertmanager.alertmanager-client.backoff-on-ratelimits",
"fieldType": "boolean",
"fieldCategory": "advanced"
},
{
"kind": "block",
"name": "backoff_config",
"required": false,
"desc": "",
"blockEntries": [
{
"kind": "field",
"name": "min_period",
"required": false,
"desc": "Minimum delay when backing off.",
"fieldValue": null,
"fieldDefaultValue": 100000000,
"fieldFlag": "alertmanager.alertmanager-client.backoff-min-period",
"fieldType": "duration",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_period",
"required": false,
"desc": "Maximum delay when backing off.",
"fieldValue": null,
"fieldDefaultValue": 10000000000,
"fieldFlag": "alertmanager.alertmanager-client.backoff-max-period",
"fieldType": "duration",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_retries",
"required": false,
"desc": "Number of times to backoff and retry before failing.",
"fieldValue": null,
"fieldDefaultValue": 10,
"fieldFlag": "alertmanager.alertmanager-client.backoff-retries",
"fieldType": "int",
"fieldCategory": "advanced"
}
],
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "tls_enabled",
Expand Down
22 changes: 21 additions & 1 deletion cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,24 @@ Usage of ./cmd/mimir/mimir:
OpenStack Swift user ID.
-alertmanager-storage.swift.username string
OpenStack Swift username.
-alertmanager.alertmanager-client.backoff-max-period duration
Maximum delay when backing off. (default 10s)
-alertmanager.alertmanager-client.backoff-min-period duration
Minimum delay when backing off. (default 100ms)
-alertmanager.alertmanager-client.backoff-on-ratelimits
Enable backoff and retry when we hit ratelimits.
-alertmanager.alertmanager-client.backoff-retries int
Number of times to backoff and retry before failing. (default 10)
-alertmanager.alertmanager-client.grpc-client-rate-limit float
Rate limit for gRPC client; 0 means disabled.
-alertmanager.alertmanager-client.grpc-client-rate-limit-burst int
Rate limit burst for gRPC client.
-alertmanager.alertmanager-client.grpc-compression string
Use compression when sending messages. Supported values are: 'gzip', 'snappy' and '' (disable compression)
-alertmanager.alertmanager-client.grpc-max-recv-msg-size int
gRPC client max receive message size (bytes). (default 104857600)
-alertmanager.alertmanager-client.grpc-max-send-msg-size int
gRPC client max send message size (bytes). (default 104857600)
-alertmanager.alertmanager-client.remote-timeout duration
Timeout for downstream alertmanagers. (default 2s)
-alertmanager.alertmanager-client.tls-ca-path string
Expand All @@ -123,12 +141,14 @@ Usage of ./cmd/mimir/mimir:
Maximum number of alerts that a single tenant can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.
-alertmanager.max-alerts-size-bytes int
Maximum total size of alerts that a single tenant can have, alert size is the sum of the bytes of its labels, annotations and generatorURL. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.
-alertmanager.max-concurrent-get-requests-per-tenant int
Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.
-alertmanager.max-config-size-bytes int
Maximum size of configuration file for Alertmanager that tenant can upload via Alertmanager API. 0 = no limit.
-alertmanager.max-dispatcher-aggregation-groups int
Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.
-alertmanager.max-recv-msg-size int
Maximum size (bytes) of an accepted HTTP request body. (default 16777216)
Maximum size (bytes) of an accepted HTTP request body. (default 104857600)
-alertmanager.max-template-size-bytes int
Maximum size of single template in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.
-alertmanager.max-templates-count int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1654,7 +1654,7 @@ The `alertmanager` block configures the alertmanager.
# (advanced) Maximum size (bytes) of an accepted HTTP request body.
# CLI flag: -alertmanager.max-recv-msg-size
[max_recv_msg_size: <int> | default = 16777216]
[max_recv_msg_size: <int> | default = 104857600]
sharding_ring:
# The key-value store used to share the hash ring across multiple instances.
Expand Down Expand Up @@ -1748,11 +1748,56 @@ sharding_ring:
# CLI flag: -alertmanager.enable-api
[enable_api: <boolean> | default = true]
# (advanced) Maximum number of concurrent GET requests allowed per tenant. The
# zero value (and negative values) result in a limit of GOMAXPROCS or 8,
# whichever is larger. Status code 503 is served for GET requests that would
# exceed the concurrency limit.
# CLI flag: -alertmanager.max-concurrent-get-requests-per-tenant
[max_concurrent_get_requests_per_tenant: <int> | default = 0]
alertmanager_client:
# (advanced) Timeout for downstream alertmanagers.
# CLI flag: -alertmanager.alertmanager-client.remote-timeout
[remote_timeout: <duration> | default = 2s]
# (advanced) gRPC client max receive message size (bytes).
# CLI flag: -alertmanager.alertmanager-client.grpc-max-recv-msg-size
[max_recv_msg_size: <int> | default = 104857600]
# (advanced) gRPC client max send message size (bytes).
# CLI flag: -alertmanager.alertmanager-client.grpc-max-send-msg-size
[max_send_msg_size: <int> | default = 104857600]
# (advanced) Use compression when sending messages. Supported values are:
# 'gzip', 'snappy' and '' (disable compression)
# CLI flag: -alertmanager.alertmanager-client.grpc-compression
[grpc_compression: <string> | default = ""]
# (advanced) Rate limit for gRPC client; 0 means disabled.
# CLI flag: -alertmanager.alertmanager-client.grpc-client-rate-limit
[rate_limit: <float> | default = 0]
# (advanced) Rate limit burst for gRPC client.
# CLI flag: -alertmanager.alertmanager-client.grpc-client-rate-limit-burst
[rate_limit_burst: <int> | default = 0]
# (advanced) Enable backoff and retry when we hit ratelimits.
# CLI flag: -alertmanager.alertmanager-client.backoff-on-ratelimits
[backoff_on_ratelimits: <boolean> | default = false]
backoff_config:
# (advanced) Minimum delay when backing off.
# CLI flag: -alertmanager.alertmanager-client.backoff-min-period
[min_period: <duration> | default = 100ms]
# (advanced) Maximum delay when backing off.
# CLI flag: -alertmanager.alertmanager-client.backoff-max-period
[max_period: <duration> | default = 10s]
# (advanced) Number of times to backoff and retry before failing.
# CLI flag: -alertmanager.alertmanager-client.backoff-retries
[max_retries: <int> | default = 10]
# (advanced) Enable TLS in the GRPC client. This flag needs to be enabled when
# any other TLS flag is set. If set to false, insecure connection to gRPC
# server will be used.
Expand Down
20 changes: 11 additions & 9 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,13 @@ const (

// Config configures an Alertmanager.
type Config struct {
UserID string
Logger log.Logger
PeerTimeout time.Duration
Retention time.Duration
ExternalURL *url.URL
Limits Limits
UserID string
Logger log.Logger
PeerTimeout time.Duration
Retention time.Duration
MaxConcurrentGetRequestsPerTenant int
ExternalURL *url.URL
Limits Limits

// Tenant-specific local directory where AM can store its state (notifications, silences, templates). When AM is stopped, entire dir is removed.
TenantDataDir string
Expand Down Expand Up @@ -244,9 +245,10 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
}

am.api, err = api.New(api.Options{
Alerts: am.alerts,
Silences: am.silences,
StatusFunc: am.marker.Status,
Alerts: am.alerts,
Silences: am.silences,
StatusFunc: am.marker.Status,
Concurrency: cfg.MaxConcurrentGetRequestsPerTenant,
// Mimir should not expose cluster information back to its tenants.
Peer: &NilPeer{},
Registry: am.registry,
Expand Down
23 changes: 4 additions & 19 deletions pkg/alertmanager/alertmanager_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"time"

"github.com/go-kit/log"
"github.com/grafana/dskit/crypto/tls"
"github.com/grafana/dskit/grpcclient"
"github.com/grafana/dskit/ring/client"
"github.com/pkg/errors"
Expand Down Expand Up @@ -39,16 +38,14 @@ type Client interface {

// ClientConfig is the configuration struct for the alertmanager client.
type ClientConfig struct {
RemoteTimeout time.Duration `yaml:"remote_timeout" category:"advanced"`
TLSEnabled bool `yaml:"tls_enabled" category:"advanced"`
TLS tls.ClientConfig `yaml:",inline"`
RemoteTimeout time.Duration `yaml:"remote_timeout" category:"advanced"`
GRPCClientConfig grpcclient.Config `yaml:",inline"`
}

// RegisterFlagsWithPrefix registers flags with prefix.
func (cfg *ClientConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
f.BoolVar(&cfg.TLSEnabled, prefix+".tls-enabled", cfg.TLSEnabled, "Enable TLS in the GRPC client. This flag needs to be enabled when any other TLS flag is set. If set to false, insecure connection to gRPC server will be used.")
cfg.GRPCClientConfig.RegisterFlagsWithPrefix(prefix, f)
f.DurationVar(&cfg.RemoteTimeout, prefix+".remote-timeout", 2*time.Second, "Timeout for downstream alertmanagers.")
cfg.TLS.RegisterFlagsWithPrefix(prefix, f)
}

// alertmanagerClientsPool is a pool of alertmanager clients.
Expand All @@ -58,18 +55,6 @@ type alertmanagerClientsPool struct {

// newAlertmanagerClientsPool creates a new pool of alertmanager clients.
func newAlertmanagerClientsPool(discovery client.PoolServiceDiscovery, amClientCfg ClientConfig, logger log.Logger, reg prometheus.Registerer) ClientsPool {
// We prefer sensible defaults instead of exposing further config options.
grpcCfg := grpcclient.Config{
MaxRecvMsgSize: 16 * 1024 * 1024, // 16MiB.
MaxSendMsgSize: 4 * 1024 * 1024, // 4MiB.
GRPCCompression: "", // No compression.
RateLimit: 0, // No rate limit.
RateLimitBurst: 0, // No burst of rate limit.
BackoffOnRatelimits: false, // No backoffs for rate limiting.
TLSEnabled: amClientCfg.TLSEnabled,
TLS: amClientCfg.TLS,
}

requestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Name: "cortex_alertmanager_distributor_client_request_duration_seconds",
Help: "Time spent executing requests from an alertmanager to another alertmanager.",
Expand All @@ -86,7 +71,7 @@ func newAlertmanagerClientsPool(discovery client.PoolServiceDiscovery, amClientC
}, []string{"operation", "status_code"})

factory := func(addr string) (client.PoolClient, error) {
return dialAlertmanagerClient(grpcCfg, addr, requestDuration)
return dialAlertmanagerClient(amClientCfg.GRPCClientConfig, addr, requestDuration)
}

poolCfg := client.PoolConfig{
Expand Down
Loading

0 comments on commit 180ff05

Please sign in to comment.