Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add configuration for Alertmanager GRPC client and GET request concurrency #1547

Merged
merged 7 commits into from
Mar 31, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,20 @@
* [CHANGE] Compactor: No longer upload debug meta files to object storage. #1257
* [FEATURE] Ruler: Allow setting `evaluation_delay` for each rule group via rules group configuration file. #1474
* [FEATURE] Distributor: Added the ability to forward specifics metrics to alternative remote_write API endpoints. #1052
* [FEATURE] Alertmanager API: Concurrency limit for GET requests is now configurable using `-alertmanager.max-concurrent-get-requests-per-tenant`. #1547
Logiraptor marked this conversation as resolved.
Show resolved Hide resolved
* [ENHANCEMENT] Alertmanager: Added the ability to configure gRPC client settings. #1547
Logiraptor marked this conversation as resolved.
Show resolved Hide resolved
- `-alertmanager.alertmanager-client.backoff-max-period`
- `-alertmanager.alertmanager-client.backoff-min-period`
Logiraptor marked this conversation as resolved.
Show resolved Hide resolved
- `-alertmanager.alertmanager-client.backoff-on-ratelimits`
- `-alertmanager.alertmanager-client.backoff-retries`
- `-alertmanager.alertmanager-client.grpc-client-rate-limit`
- `-alertmanager.alertmanager-client.grpc-client-rate-limit-burst`
- `-alertmanager.alertmanager-client.grpc-compression`
- `-alertmanager.alertmanager-client.grpc-max-recv-msg-size`
- `-alertmanager.alertmanager-client.grpc-max-send-msg-size`
- Default values have also changed for the following settings:
- `-alertmanager.alertmanager-client.grpc-max-recv-msg-size` now defaults to 100 MiB (previously not configurable and fixed at 16 MiB)
Logiraptor marked this conversation as resolved.
Show resolved Hide resolved
- `-alertmanager.alertmanager-client.grpc-max-send-msg-size` now defaults to 100 MiB (previously not configurable and fixed at 4 MiB)
Logiraptor marked this conversation as resolved.
Show resolved Hide resolved
* [ENHANCEMENT] Ruler: Add more detailed query information to ruler query stats logging. #1411
* [ENHANCEMENT] Admin: Admin API now has some styling. #1482
* [BUGFIX] Query-frontend: do not shard queries with a subquery unless the subquery is inside a shardable aggregation function call. #1542
Expand Down
120 changes: 120 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -8387,6 +8387,17 @@
"fieldType": "boolean",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_concurrent_get_requests_per_tenant",
"required": false,
"desc": "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "alertmanager.max-concurrent-get-requests-per-tenant",
"fieldType": "int",
"fieldCategory": "advanced"
},
{
"kind": "block",
"name": "alertmanager_client",
Expand All @@ -8404,6 +8415,115 @@
"fieldType": "duration",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_recv_msg_size",
"required": false,
"desc": "gRPC client max receive message size (bytes).",
"fieldValue": null,
"fieldDefaultValue": 104857600,
"fieldFlag": "alertmanager.alertmanager-client.grpc-max-recv-msg-size",
"fieldType": "int",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_send_msg_size",
"required": false,
"desc": "gRPC client max send message size (bytes).",
"fieldValue": null,
"fieldDefaultValue": 104857600,
"fieldFlag": "alertmanager.alertmanager-client.grpc-max-send-msg-size",
"fieldType": "int",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "grpc_compression",
"required": false,
"desc": "Use compression when sending messages. Supported values are: 'gzip', 'snappy' and '' (disable compression)",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "alertmanager.alertmanager-client.grpc-compression",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "rate_limit",
"required": false,
"desc": "Rate limit for gRPC client; 0 means disabled.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "alertmanager.alertmanager-client.grpc-client-rate-limit",
"fieldType": "float",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "rate_limit_burst",
"required": false,
"desc": "Rate limit burst for gRPC client.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "alertmanager.alertmanager-client.grpc-client-rate-limit-burst",
"fieldType": "int",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "backoff_on_ratelimits",
"required": false,
"desc": "Enable backoff and retry when we hit ratelimits.",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldFlag": "alertmanager.alertmanager-client.backoff-on-ratelimits",
"fieldType": "boolean",
"fieldCategory": "advanced"
},
{
"kind": "block",
"name": "backoff_config",
"required": false,
"desc": "",
"blockEntries": [
{
"kind": "field",
"name": "min_period",
"required": false,
"desc": "Minimum delay when backing off.",
"fieldValue": null,
"fieldDefaultValue": 100000000,
"fieldFlag": "alertmanager.alertmanager-client.backoff-min-period",
"fieldType": "duration",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_period",
"required": false,
"desc": "Maximum delay when backing off.",
"fieldValue": null,
"fieldDefaultValue": 10000000000,
"fieldFlag": "alertmanager.alertmanager-client.backoff-max-period",
"fieldType": "duration",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "max_retries",
"required": false,
"desc": "Number of times to backoff and retry before failing.",
"fieldValue": null,
"fieldDefaultValue": 10,
"fieldFlag": "alertmanager.alertmanager-client.backoff-retries",
"fieldType": "int",
"fieldCategory": "advanced"
}
],
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "tls_enabled",
Expand Down
20 changes: 20 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,24 @@ Usage of ./cmd/mimir/mimir:
OpenStack Swift user ID.
-alertmanager-storage.swift.username string
OpenStack Swift username.
-alertmanager.alertmanager-client.backoff-max-period duration
Maximum delay when backing off. (default 10s)
-alertmanager.alertmanager-client.backoff-min-period duration
Minimum delay when backing off. (default 100ms)
-alertmanager.alertmanager-client.backoff-on-ratelimits
Enable backoff and retry when we hit ratelimits.
-alertmanager.alertmanager-client.backoff-retries int
Number of times to backoff and retry before failing. (default 10)
-alertmanager.alertmanager-client.grpc-client-rate-limit float
Rate limit for gRPC client; 0 means disabled.
-alertmanager.alertmanager-client.grpc-client-rate-limit-burst int
Rate limit burst for gRPC client.
-alertmanager.alertmanager-client.grpc-compression string
Use compression when sending messages. Supported values are: 'gzip', 'snappy' and '' (disable compression)
-alertmanager.alertmanager-client.grpc-max-recv-msg-size int
gRPC client max receive message size (bytes). (default 104857600)
-alertmanager.alertmanager-client.grpc-max-send-msg-size int
gRPC client max send message size (bytes). (default 104857600)
-alertmanager.alertmanager-client.remote-timeout duration
Timeout for downstream alertmanagers. (default 2s)
-alertmanager.alertmanager-client.tls-ca-path string
Expand All @@ -123,6 +141,8 @@ Usage of ./cmd/mimir/mimir:
Maximum number of alerts that a single tenant can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.
-alertmanager.max-alerts-size-bytes int
Maximum total size of alerts that a single tenant can have, alert size is the sum of the bytes of its labels, annotations and generatorURL. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.
-alertmanager.max-concurrent-get-requests-per-tenant int
Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.
-alertmanager.max-config-size-bytes int
Maximum size of configuration file for Alertmanager that tenant can upload via Alertmanager API. 0 = no limit.
-alertmanager.max-dispatcher-aggregation-groups int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1748,11 +1748,56 @@ sharding_ring:
# CLI flag: -alertmanager.enable-api
[enable_api: <boolean> | default = true]

# (advanced) Maximum number of concurrent GET requests allowed per tenant. The
# zero value (and negative values) result in a limit of GOMAXPROCS or 8,
# whichever is larger. Status code 503 is served for GET requests that would
# exceed the concurrency limit.
# CLI flag: -alertmanager.max-concurrent-get-requests-per-tenant
[max_concurrent_get_requests_per_tenant: <int> | default = 0]

alertmanager_client:
# (advanced) Timeout for downstream alertmanagers.
# CLI flag: -alertmanager.alertmanager-client.remote-timeout
[remote_timeout: <duration> | default = 2s]

# (advanced) gRPC client max receive message size (bytes).
# CLI flag: -alertmanager.alertmanager-client.grpc-max-recv-msg-size
[max_recv_msg_size: <int> | default = 104857600]

# (advanced) gRPC client max send message size (bytes).
# CLI flag: -alertmanager.alertmanager-client.grpc-max-send-msg-size
[max_send_msg_size: <int> | default = 104857600]
Logiraptor marked this conversation as resolved.
Show resolved Hide resolved

# (advanced) Use compression when sending messages. Supported values are:
# 'gzip', 'snappy' and '' (disable compression)
# CLI flag: -alertmanager.alertmanager-client.grpc-compression
[grpc_compression: <string> | default = ""]

# (advanced) Rate limit for gRPC client; 0 means disabled.
# CLI flag: -alertmanager.alertmanager-client.grpc-client-rate-limit
[rate_limit: <float> | default = 0]

# (advanced) Rate limit burst for gRPC client.
# CLI flag: -alertmanager.alertmanager-client.grpc-client-rate-limit-burst
[rate_limit_burst: <int> | default = 0]

# (advanced) Enable backoff and retry when we hit ratelimits.
# CLI flag: -alertmanager.alertmanager-client.backoff-on-ratelimits
[backoff_on_ratelimits: <boolean> | default = false]

backoff_config:
# (advanced) Minimum delay when backing off.
# CLI flag: -alertmanager.alertmanager-client.backoff-min-period
[min_period: <duration> | default = 100ms]

# (advanced) Maximum delay when backing off.
# CLI flag: -alertmanager.alertmanager-client.backoff-max-period
[max_period: <duration> | default = 10s]

# (advanced) Number of times to backoff and retry before failing.
# CLI flag: -alertmanager.alertmanager-client.backoff-retries
[max_retries: <int> | default = 10]

# (advanced) Enable TLS in the GRPC client. This flag needs to be enabled when
# any other TLS flag is set. If set to false, insecure connection to gRPC
# server will be used.
Expand Down
20 changes: 11 additions & 9 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,13 @@ const (

// Config configures an Alertmanager.
type Config struct {
UserID string
Logger log.Logger
PeerTimeout time.Duration
Retention time.Duration
ExternalURL *url.URL
Limits Limits
UserID string
Logger log.Logger
PeerTimeout time.Duration
Retention time.Duration
MaxConcurrentGetRequestsPerTenant int
ExternalURL *url.URL
Limits Limits

// Tenant-specific local directory where AM can store its state (notifications, silences, templates). When AM is stopped, entire dir is removed.
TenantDataDir string
Expand Down Expand Up @@ -244,9 +245,10 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
}

am.api, err = api.New(api.Options{
Alerts: am.alerts,
Silences: am.silences,
StatusFunc: am.marker.Status,
Alerts: am.alerts,
Silences: am.silences,
StatusFunc: am.marker.Status,
Concurrency: cfg.MaxConcurrentGetRequestsPerTenant,
// Mimir should not expose cluster information back to its tenants.
Peer: &NilPeer{},
Registry: am.registry,
Expand Down
23 changes: 4 additions & 19 deletions pkg/alertmanager/alertmanager_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"time"

"github.com/go-kit/log"
"github.com/grafana/dskit/crypto/tls"
"github.com/grafana/dskit/grpcclient"
"github.com/grafana/dskit/ring/client"
"github.com/pkg/errors"
Expand Down Expand Up @@ -39,16 +38,14 @@ type Client interface {

// ClientConfig is the configuration struct for the alertmanager client.
type ClientConfig struct {
RemoteTimeout time.Duration `yaml:"remote_timeout" category:"advanced"`
TLSEnabled bool `yaml:"tls_enabled" category:"advanced"`
TLS tls.ClientConfig `yaml:",inline"`
RemoteTimeout time.Duration `yaml:"remote_timeout" category:"advanced"`
GRPCClientConfig grpcclient.Config `yaml:",inline"`
}

// RegisterFlagsWithPrefix registers flags with prefix.
func (cfg *ClientConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
f.BoolVar(&cfg.TLSEnabled, prefix+".tls-enabled", cfg.TLSEnabled, "Enable TLS in the GRPC client. This flag needs to be enabled when any other TLS flag is set. If set to false, insecure connection to gRPC server will be used.")
cfg.GRPCClientConfig.RegisterFlagsWithPrefix(prefix, f)
f.DurationVar(&cfg.RemoteTimeout, prefix+".remote-timeout", 2*time.Second, "Timeout for downstream alertmanagers.")
cfg.TLS.RegisterFlagsWithPrefix(prefix, f)
}

// alertmanagerClientsPool is a pool of alertmanager clients.
Expand All @@ -58,18 +55,6 @@ type alertmanagerClientsPool struct {

// newAlertmanagerClientsPool creates a new pool of alertmanager clients.
func newAlertmanagerClientsPool(discovery client.PoolServiceDiscovery, amClientCfg ClientConfig, logger log.Logger, reg prometheus.Registerer) ClientsPool {
// We prefer sensible defaults instead of exposing further config options.
grpcCfg := grpcclient.Config{
MaxRecvMsgSize: 16 * 1024 * 1024, // 16MiB.
MaxSendMsgSize: 4 * 1024 * 1024, // 4MiB.
GRPCCompression: "", // No compression.
RateLimit: 0, // No rate limit.
RateLimitBurst: 0, // No burst of rate limit.
BackoffOnRatelimits: false, // No backoffs for rate limiting.
TLSEnabled: amClientCfg.TLSEnabled,
TLS: amClientCfg.TLS,
}

requestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Name: "cortex_alertmanager_distributor_client_request_duration_seconds",
Help: "Time spent executing requests from an alertmanager to another alertmanager.",
Expand All @@ -86,7 +71,7 @@ func newAlertmanagerClientsPool(discovery client.PoolServiceDiscovery, amClientC
}, []string{"operation", "status_code"})

factory := func(addr string) (client.PoolClient, error) {
return dialAlertmanagerClient(grpcCfg, addr, requestDuration)
return dialAlertmanagerClient(amClientCfg.GRPCClientConfig, addr, requestDuration)
}

poolCfg := client.PoolConfig{
Expand Down
26 changes: 15 additions & 11 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ type MultitenantAlertmanagerConfig struct {

EnableAPI bool `yaml:"enable_api" category:"advanced"`

MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"`

// For distributor.
AlertmanagerClient ClientConfig `yaml:"alertmanager_client"`

Expand All @@ -103,6 +105,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger
f.DurationVar(&cfg.PollInterval, "alertmanager.configs.poll-interval", 15*time.Second, "How frequently to poll Alertmanager configs.")

f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.")
f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.")

cfg.AlertmanagerClient.RegisterFlagsWithPrefix("alertmanager.alertmanager-client", f)
cfg.Persister.RegisterFlagsWithPrefix("alertmanager", f)
Expand Down Expand Up @@ -842,17 +845,18 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco
}

newAM, err := New(&Config{
UserID: userID,
TenantDataDir: tenantDir,
Logger: am.logger,
PeerTimeout: am.cfg.PeerTimeout,
Retention: am.cfg.Retention,
ExternalURL: am.cfg.ExternalURL.URL,
Replicator: am,
ReplicationFactor: am.cfg.ShardingRing.ReplicationFactor,
Store: am.store,
PersisterConfig: am.cfg.Persister,
Limits: am.limits,
UserID: userID,
TenantDataDir: tenantDir,
Logger: am.logger,
PeerTimeout: am.cfg.PeerTimeout,
Retention: am.cfg.Retention,
MaxConcurrentGetRequestsPerTenant: am.cfg.MaxConcurrentGetRequestsPerTenant,
ExternalURL: am.cfg.ExternalURL.URL,
Replicator: am,
ReplicationFactor: am.cfg.ShardingRing.ReplicationFactor,
Store: am.store,
PersisterConfig: am.cfg.Persister,
Limits: am.limits,
}, reg)
if err != nil {
return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err)
Expand Down