From bce916ae33796fdf0d68e67561cf1b5b5cf7b740 Mon Sep 17 00:00:00 2001 From: zenador Date: Fri, 3 Jun 2022 16:08:47 +0800 Subject: [PATCH 1/7] add validation.RateLimited to error catalogue --- CHANGELOG.md | 2 +- .../operators-guide/mimir-runbooks/_index.md | 8 ++++++++ pkg/distributor/distributor.go | 12 +++++------ pkg/distributor/distributor_test.go | 18 ++++++++--------- pkg/util/globalerror/errors.go | 20 ++++++++++++++++++- pkg/util/validation/errors.go | 12 +++++++++++ pkg/util/validation/limits.go | 12 +++++++---- pkg/util/validation/validate.go | 8 ++++---- 8 files changed, 67 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8064c7e4b84..85dde2c2845 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ * `cortex_discarded_requests_total` * [CHANGE] Blocks uploaded by ingester no longer contain `__org_id__` label. Compactor now ignores this label and will compact blocks with and without this label together. `mimirconvert` tool will remove the label from blocks as "unknown" label. #1972 * [ENHANCEMENT] Store-gateway: Add the experimental ability to run requests in a dedicated OS thread pool. This feature can be configured using `-store-gateway.thread-pool-size` and is disabled by default. Replaces the ability to run index header operations in a dedicated thread pool. #1660 #1812 -* [ENHANCEMENT] Improved error messages to make them easier to understand; each now have a unique, global identifier that you can use to look up in the runbooks for more information. #1907 #1919 #1888 #1939 #1984 +* [ENHANCEMENT] Improved error messages to make them easier to understand; each now have a unique, global identifier that you can use to look up in the runbooks for more information. #1907 #1919 #1888 #1939 #1984 #2009 * [ENHANCEMENT] Memberlist KV: incoming messages are now processed on per-key goroutine. This may reduce loss of "maintanance" packets in busy memberlist installations, but use more CPU. New `memberlist_client_received_broadcasts_dropped_total` counter tracks number of dropped per-key messages. #1912 * [ENHANCEMENT] Blocks Storage, Alertmanager, Ruler: add support a prefix to the bucket store (`*_storage.storage_prefix`). This enables using the same bucket for the three components. #1686 #1951 * [BUGFIX] Fix regexp parsing panic for regexp label matchers with start/end quantifiers. #1883 diff --git a/docs/sources/operators-guide/mimir-runbooks/_index.md b/docs/sources/operators-guide/mimir-runbooks/_index.md index 6f5b5939b4e..15457bd6001 100644 --- a/docs/sources/operators-guide/mimir-runbooks/_index.md +++ b/docs/sources/operators-guide/mimir-runbooks/_index.md @@ -1345,6 +1345,14 @@ Mimir has a limit on the query length. This limit is applied to partial queries, after they've split (according to time) by the query-frontend. This limit protects the system’s stability from potential abuse or mistakes. To configure the limit on a per-tenant basis, use the `-store.max-query-length` option (or `max_query_length` in the runtime configuration). +### err-mimir-request-rate-limited + +TBA + +### err-mimir-ingestion-rate-limited + +TBA + ## Mimir routes by path **Write path**: diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 61a82a9e8e8..b274c85c9bc 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -593,12 +593,12 @@ func (d *Distributor) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReq now := mtime.Now() if !d.requestRateLimiter.AllowN(now, userID, 1) { - validation.DiscardedRequests.WithLabelValues(validation.RateLimited, userID).Add(1) + validation.DiscardedRequests.WithLabelValues(validation.ReasonRateLimited, userID).Add(1) // Return a 429 here to tell the client it is going too fast. // Client may discard the data or slow down and re-send. // Prometheus v2.26 added a remote-write option 'retry_on_http_429'. - return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "request rate limit (%v) exceeded", d.requestRateLimiter.Limit(now, userID)) + return nil, httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewRequestRateLimitedError(d.requestRateLimiter.Limit(now, userID), d.requestRateLimiter.Burst(now, userID)).Error()) } d.activeUsers.UpdateUserTimestamp(userID, now) @@ -789,13 +789,13 @@ func (d *Distributor) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReq totalN := validatedSamples + validatedExemplars + len(validatedMetadata) if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { - validation.DiscardedSamples.WithLabelValues(validation.RateLimited, userID).Add(float64(validatedSamples)) - validation.DiscardedExemplars.WithLabelValues(validation.RateLimited, userID).Add(float64(validatedExemplars)) - validation.DiscardedMetadata.WithLabelValues(validation.RateLimited, userID).Add(float64(len(validatedMetadata))) + validation.DiscardedSamples.WithLabelValues(validation.ReasonRateLimited, userID).Add(float64(validatedSamples)) + validation.DiscardedExemplars.WithLabelValues(validation.ReasonRateLimited, userID).Add(float64(validatedExemplars)) + validation.DiscardedMetadata.WithLabelValues(validation.ReasonRateLimited, userID).Add(float64(len(validatedMetadata))) // Return a 429 here to tell the client it is going too fast. // Client may discard the data or slow down and re-send. // Prometheus v2.26 added a remote-write option 'retry_on_http_429'. - return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "ingestion rate limit (%v) exceeded while adding %d samples and %d metadata", d.ingestionRateLimiter.Limit(now, userID), validatedSamples, len(validatedMetadata)) + return nil, httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewIngestionRateLimitedError(d.ingestionRateLimiter.Limit(now, userID), d.ingestionRateLimiter.Burst(now, userID), validatedSamples, validatedExemplars, len(validatedMetadata)).Error()) } // totalN included samples and metadata. Ingester follows this pattern when computing its ingestion rate. diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index dc6200cc0c9..459bcd7fa56 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -184,7 +184,7 @@ func TestDistributor_Push(t *testing.T) { happyIngesters: 3, samples: samplesIn{num: 25, startTimestampMs: 123456789000}, metadata: 5, - expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "ingestion rate limit (20) exceeded while adding 25 samples and 5 metadata"), + expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewIngestionRateLimitedError(20, 20, 25, 0, 5).Error()), metricNames: []string{lastSeenTimestamp}, expectedMetrics: ` # HELP cortex_distributor_latest_seen_sample_timestamp_seconds Unix timestamp of latest received sample per user. @@ -431,7 +431,7 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { pushes: []testPush{ {expectedError: nil}, {expectedError: nil}, - {expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "request rate limit (2) exceeded")}, + {expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewRequestRateLimitedError(2, 2).Error())}, }, }, "request limit is disabled when set to 0": { @@ -452,7 +452,7 @@ func TestDistributor_PushRequestRateLimiter(t *testing.T) { {expectedError: nil}, {expectedError: nil}, {expectedError: nil}, - {expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "request rate limit (1) exceeded")}, + {expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewRequestRateLimitedError(1, 3).Error())}, }, }, } @@ -512,10 +512,10 @@ func TestDistributor_PushIngestionRateLimiter(t *testing.T) { pushes: []testPush{ {samples: 2, expectedError: nil}, {samples: 1, expectedError: nil}, - {samples: 2, metadata: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "ingestion rate limit (5) exceeded while adding 2 samples and 1 metadata")}, + {samples: 2, metadata: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewIngestionRateLimitedError(5, 5, 2, 0, 1).Error())}, {samples: 2, expectedError: nil}, - {samples: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "ingestion rate limit (5) exceeded while adding 1 samples and 0 metadata")}, - {metadata: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "ingestion rate limit (5) exceeded while adding 0 samples and 1 metadata")}, + {samples: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewIngestionRateLimitedError(5, 5, 1, 0, 0).Error())}, + {metadata: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewIngestionRateLimitedError(5, 5, 0, 0, 1).Error())}, }, }, "for each distributor, set an ingestion burst limit.": { @@ -525,10 +525,10 @@ func TestDistributor_PushIngestionRateLimiter(t *testing.T) { pushes: []testPush{ {samples: 10, expectedError: nil}, {samples: 5, expectedError: nil}, - {samples: 5, metadata: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "ingestion rate limit (5) exceeded while adding 5 samples and 1 metadata")}, + {samples: 5, metadata: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewIngestionRateLimitedError(5, 20, 5, 0, 1).Error())}, {samples: 5, expectedError: nil}, - {samples: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "ingestion rate limit (5) exceeded while adding 1 samples and 0 metadata")}, - {metadata: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, "ingestion rate limit (5) exceeded while adding 0 samples and 1 metadata")}, + {samples: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewIngestionRateLimitedError(5, 20, 1, 0, 0).Error())}, + {metadata: 1, expectedError: httpgrpc.Errorf(http.StatusTooManyRequests, validation.NewIngestionRateLimitedError(5, 20, 0, 0, 1).Error())}, }, }, } diff --git a/pkg/util/globalerror/errors.go b/pkg/util/globalerror/errors.go index c67ad85d15c..2f5f8723e37 100644 --- a/pkg/util/globalerror/errors.go +++ b/pkg/util/globalerror/errors.go @@ -4,6 +4,7 @@ package globalerror import ( "fmt" + "strings" ) type ID string @@ -47,7 +48,9 @@ const ( MetricMetadataHelpTooLong ID = "help-too-long" MetricMetadataUnitTooLong ID = "unit-too-long" - MaxQueryLength ID = "max-query-length" + MaxQueryLength ID = "max-query-length" + RequestRateLimited ID = "request-rate-limited" + IngestionRateLimited ID = "ingestion-rate-limited" ) // Message returns the provided msg, appending the error id. @@ -60,3 +63,18 @@ func (id ID) Message(msg string) string { func (id ID) MessageWithLimitConfig(flag, msg string) string { return fmt.Sprintf("%s (%s%s). You can adjust the related per-tenant limit by configuring -%s, or by contacting your service administrator.", msg, errPrefix, id, flag) } + +func (id ID) MessageWithLimitConfigs(msg, flag string, addFlags ...string) string { + var sb strings.Builder + sb.WriteString("-") + sb.WriteString(flag) + if len(addFlags) > 0 { + for _, addFlag := range addFlags[:len(addFlags)-1] { + sb.WriteString(", -") + sb.WriteString(addFlag) + } + sb.WriteString(" and -") + sb.WriteString(addFlags[len(addFlags)-1]) + } + return fmt.Sprintf("%s (%s%s). You can adjust the related per-tenant limits by configuring %s, or by contacting your service administrator.", msg, errPrefix, id, sb.String()) +} diff --git a/pkg/util/validation/errors.go b/pkg/util/validation/errors.go index 1f8e2926c4b..3c27dfe54e4 100644 --- a/pkg/util/validation/errors.go +++ b/pkg/util/validation/errors.go @@ -275,6 +275,18 @@ func NewMaxQueryLengthError(actualQueryLen, maxQueryLength time.Duration) LimitE fmt.Sprintf("the query time range exceeds the limit (query length: %s, limit: %s)", actualQueryLen, maxQueryLength))) } +func NewRequestRateLimitedError(limit float64, burst int) LimitError { + return LimitError(globalerror.RequestRateLimited.MessageWithLimitConfigs( + fmt.Sprintf("request rate limit (%v) with burst (%d) exceeded", limit, burst), + requestRateFlag, requestBurstSizeFlag)) +} + +func NewIngestionRateLimitedError(limit float64, burst, numSamples, numExemplars, numMetadata int) LimitError { + return LimitError(globalerror.IngestionRateLimited.MessageWithLimitConfigs( + fmt.Sprintf("ingestion rate limit (%v) with burst (%d) exceeded while adding %d samples, %d exemplars and %d metadata", limit, burst, numSamples, numExemplars, numMetadata), + ingestionRateFlag, ingestionBurstSizeFlag)) +} + // formatLabelSet formats label adapters as a metric name with labels, while preserving // label order, and keeping duplicates. If there are multiple "__name__" labels, only // first one is used as metric name, other ones will be included as regular labels. diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 525c5c68aef..886ee6972a6 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -35,6 +35,10 @@ const ( maxMetadataLengthFlag = "validation.max-metadata-length" creationGracePeriodFlag = "validation.create-grace-period" maxQueryLengthFlag = "store.max-query-length" + requestRateFlag = "distributor.request-rate-limit" + requestBurstSizeFlag = "distributor.request-burst-size" + ingestionRateFlag = "distributor.ingestion-rate-limit" + ingestionBurstSizeFlag = "distributor.ingestion-burst-size" ) // LimitError are errors that do not comply with the limits specified. @@ -147,10 +151,10 @@ type Limits struct { // RegisterFlags adds the flags required to config this to the given FlagSet func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.IngestionTenantShardSize, "distributor.ingestion-tenant-shard-size", 0, "The tenant's shard size used by shuffle-sharding. Must be set both on ingesters and distributors. 0 disables shuffle sharding.") - f.Float64Var(&l.RequestRate, "distributor.request-rate-limit", 0, "Per-tenant request rate limit in requests per second. 0 to disable.") - f.IntVar(&l.RequestBurstSize, "distributor.request-burst-size", 0, "Per-tenant allowed request burst size. 0 to disable.") - f.Float64Var(&l.IngestionRate, "distributor.ingestion-rate-limit", 10000, "Per-tenant ingestion rate limit in samples per second.") - f.IntVar(&l.IngestionBurstSize, "distributor.ingestion-burst-size", 200000, "Per-tenant allowed ingestion burst size (in number of samples).") + f.Float64Var(&l.RequestRate, requestRateFlag, 0, "Per-tenant request rate limit in requests per second. 0 to disable.") + f.IntVar(&l.RequestBurstSize, requestBurstSizeFlag, 0, "Per-tenant allowed request burst size. 0 to disable.") + f.Float64Var(&l.IngestionRate, ingestionRateFlag, 10000, "Per-tenant ingestion rate limit in samples per second.") + f.IntVar(&l.IngestionBurstSize, ingestionBurstSizeFlag, 200000, "Per-tenant allowed ingestion burst size (in number of samples).") f.BoolVar(&l.AcceptHASamples, "distributor.ha-tracker.enable-for-all-users", false, "Flag to enable, for all tenants, handling of samples with external labels identifying replicas in an HA Prometheus setup.") f.StringVar(&l.HAClusterLabel, "distributor.ha-tracker.cluster", "cluster", "Prometheus label to look for in samples to identify a Prometheus HA cluster.") f.StringVar(&l.HAReplicaLabel, "distributor.ha-tracker.replica", "__replica__", "Prometheus label to look for in samples to identify a Prometheus HA replica.") diff --git a/pkg/util/validation/validate.go b/pkg/util/validation/validate.go index bf4655a8c17..6355b025162 100644 --- a/pkg/util/validation/validate.go +++ b/pkg/util/validation/validate.go @@ -24,10 +24,6 @@ import ( const ( discardReasonLabel = "reason" - // RateLimited is one of the values for the reason to discard samples. - // Declared here to avoid duplication in ingester and distributor. - RateLimited = "rate_limited" - // Too many HA clusters is one of the reasons for discarding samples. TooManyHAClusters = "too_many_ha_clusters" @@ -59,6 +55,10 @@ var ( reasonMetadataMetricNameTooLong = metricReasonFromErrorID(globalerror.MetricMetadataMetricNameTooLong) reasonMetadataHelpTooLong = metricReasonFromErrorID(globalerror.MetricMetadataHelpTooLong) reasonMetadataUnitTooLong = metricReasonFromErrorID(globalerror.MetricMetadataUnitTooLong) + + // ReasonRateLimited is one of the values for the reason to discard samples. + // Declared here to avoid duplication in ingester and distributor. + ReasonRateLimited = "rate_limited" // same for request and ingestion which are separate errors, so not using metricReasonFromErrorID with global error ) func metricReasonFromErrorID(id globalerror.ID) string { From b39c533817e11dea1daeb289bfb7377c7698fa6d Mon Sep 17 00:00:00 2001 From: zenador Date: Mon, 6 Jun 2022 14:29:15 +0800 Subject: [PATCH 2/7] Add validation.TooManyHAClusters to error catalogue --- docs/sources/operators-guide/mimir-runbooks/_index.md | 4 ++++ pkg/distributor/distributor.go | 6 +++--- pkg/distributor/ha_tracker.go | 6 +++++- pkg/distributor/ha_tracker_test.go | 4 ++-- pkg/util/globalerror/errors.go | 1 + pkg/util/validation/limits.go | 3 ++- pkg/util/validation/validate.go | 6 +++--- 7 files changed, 20 insertions(+), 10 deletions(-) diff --git a/docs/sources/operators-guide/mimir-runbooks/_index.md b/docs/sources/operators-guide/mimir-runbooks/_index.md index 15457bd6001..db5d1792580 100644 --- a/docs/sources/operators-guide/mimir-runbooks/_index.md +++ b/docs/sources/operators-guide/mimir-runbooks/_index.md @@ -1353,6 +1353,10 @@ TBA TBA +### err-mimir-too-many-ha-clusters + +TBA + ## Mimir routes by path **Write path**: diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index b274c85c9bc..2d6a573cdb9 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -643,12 +643,12 @@ func (d *Distributor) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReq if errors.Is(err, replicasNotMatchError{}) { // These samples have been deduped. d.dedupedSamples.WithLabelValues(userID, cluster).Add(float64(numSamples)) - return nil, httpgrpc.Errorf(http.StatusAccepted, "%s", err) + return nil, httpgrpc.Errorf(http.StatusAccepted, err.Error()) } if errors.Is(err, tooManyClustersError{}) { - validation.DiscardedSamples.WithLabelValues(validation.TooManyHAClusters, userID).Add(float64(numSamples)) - return nil, httpgrpc.Errorf(http.StatusBadRequest, "%s", err) + validation.DiscardedSamples.WithLabelValues(validation.ReasonTooManyHAClusters, userID).Add(float64(numSamples)) + return nil, httpgrpc.Errorf(http.StatusBadRequest, err.Error()) } return nil, err diff --git a/pkg/distributor/ha_tracker.go b/pkg/distributor/ha_tracker.go index f7dd137e43e..cfb7575a493 100644 --- a/pkg/distributor/ha_tracker.go +++ b/pkg/distributor/ha_tracker.go @@ -27,6 +27,8 @@ import ( "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util" + "github.com/grafana/mimir/pkg/util/globalerror" + "github.com/grafana/mimir/pkg/util/validation" ) var ( @@ -528,7 +530,9 @@ type tooManyClustersError struct { } func (e tooManyClustersError) Error() string { - return fmt.Sprintf("too many HA clusters (limit: %d)", e.limit) + return globalerror.TooManyHAClusters.MessageWithLimitConfig( + validation.HATrackerMaxClustersFlag, + fmt.Sprintf("too many HA clusters (limit: %d)", e.limit)) } // Needed for errors.Is to work properly. diff --git a/pkg/distributor/ha_tracker_test.go b/pkg/distributor/ha_tracker_test.go index 8a5a2167db5..e39a278e3b9 100644 --- a/pkg/distributor/ha_tracker_test.go +++ b/pkg/distributor/ha_tracker_test.go @@ -602,7 +602,7 @@ func TestHAClustersLimit(t *testing.T) { assert.NoError(t, t1.checkReplica(context.Background(), userID, "b", "b1", now)) waitForClustersUpdate(t, 2, t1, userID) - assert.EqualError(t, t1.checkReplica(context.Background(), userID, "c", "c1", now), "too many HA clusters (limit: 2)") + assert.EqualError(t, t1.checkReplica(context.Background(), userID, "c", "c1", now), tooManyClustersError{limit: 2}.Error()) // Move time forward, and make sure that checkReplica for existing cluster works fine. now = now.Add(5 * time.Second) // higher than "update timeout" @@ -627,7 +627,7 @@ func TestHAClustersLimit(t *testing.T) { waitForClustersUpdate(t, 2, t1, userID) // But yet another cluster doesn't. - assert.EqualError(t, t1.checkReplica(context.Background(), userID, "a", "a2", now), "too many HA clusters (limit: 2)") + assert.EqualError(t, t1.checkReplica(context.Background(), userID, "a", "a2", now), tooManyClustersError{limit: 2}.Error()) now = now.Add(5 * time.Second) diff --git a/pkg/util/globalerror/errors.go b/pkg/util/globalerror/errors.go index 2f5f8723e37..bdc61f45218 100644 --- a/pkg/util/globalerror/errors.go +++ b/pkg/util/globalerror/errors.go @@ -51,6 +51,7 @@ const ( MaxQueryLength ID = "max-query-length" RequestRateLimited ID = "request-rate-limited" IngestionRateLimited ID = "ingestion-rate-limited" + TooManyHAClusters ID = "too-many-ha-clusters" ) // Message returns the provided msg, appending the error id. diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 886ee6972a6..564d72165df 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -39,6 +39,7 @@ const ( requestBurstSizeFlag = "distributor.request-burst-size" ingestionRateFlag = "distributor.ingestion-rate-limit" ingestionBurstSizeFlag = "distributor.ingestion-burst-size" + HATrackerMaxClustersFlag = "distributor.ha-tracker.max-clusters" ) // LimitError are errors that do not comply with the limits specified. @@ -158,7 +159,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&l.AcceptHASamples, "distributor.ha-tracker.enable-for-all-users", false, "Flag to enable, for all tenants, handling of samples with external labels identifying replicas in an HA Prometheus setup.") f.StringVar(&l.HAClusterLabel, "distributor.ha-tracker.cluster", "cluster", "Prometheus label to look for in samples to identify a Prometheus HA cluster.") f.StringVar(&l.HAReplicaLabel, "distributor.ha-tracker.replica", "__replica__", "Prometheus label to look for in samples to identify a Prometheus HA replica.") - f.IntVar(&l.HAMaxClusters, "distributor.ha-tracker.max-clusters", 0, "Maximum number of clusters that HA tracker will keep track of for a single tenant. 0 to disable the limit.") + f.IntVar(&l.HAMaxClusters, HATrackerMaxClustersFlag, 0, "Maximum number of clusters that HA tracker will keep track of for a single tenant. 0 to disable the limit.") f.Var(&l.DropLabels, "distributor.drop-label", "This flag can be used to specify label names that to drop during sample ingestion within the distributor and can be repeated in order to drop multiple labels.") f.IntVar(&l.MaxLabelNameLength, maxLabelNameLengthFlag, 1024, "Maximum length accepted for label names") f.IntVar(&l.MaxLabelValueLength, maxLabelValueLengthFlag, 2048, "Maximum length accepted for label value. This setting also applies to the metric name") diff --git a/pkg/util/validation/validate.go b/pkg/util/validation/validate.go index 6355b025162..4bb915d19d8 100644 --- a/pkg/util/validation/validate.go +++ b/pkg/util/validation/validate.go @@ -24,9 +24,6 @@ import ( const ( discardReasonLabel = "reason" - // Too many HA clusters is one of the reasons for discarding samples. - TooManyHAClusters = "too_many_ha_clusters" - // The combined length of the label names and values of an Exemplar's LabelSet MUST NOT exceed 128 UTF-8 characters // https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars ExemplarMaxLabelSetLength = 128 @@ -59,6 +56,9 @@ var ( // ReasonRateLimited is one of the values for the reason to discard samples. // Declared here to avoid duplication in ingester and distributor. ReasonRateLimited = "rate_limited" // same for request and ingestion which are separate errors, so not using metricReasonFromErrorID with global error + + // ReasonTooManyHAClusters is one of the reasons for discarding samples. + ReasonTooManyHAClusters = metricReasonFromErrorID(globalerror.TooManyHAClusters) ) func metricReasonFromErrorID(id globalerror.ID) string { From 12645084d4c9833df8fb3195784b12d19ad73fd6 Mon Sep 17 00:00:00 2001 From: zenador Date: Tue, 7 Jun 2022 15:44:45 +0800 Subject: [PATCH 3/7] update docs --- .../operators-guide/mimir-runbooks/_index.md | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/docs/sources/operators-guide/mimir-runbooks/_index.md b/docs/sources/operators-guide/mimir-runbooks/_index.md index db5d1792580..612564dfe1b 100644 --- a/docs/sources/operators-guide/mimir-runbooks/_index.md +++ b/docs/sources/operators-guide/mimir-runbooks/_index.md @@ -1347,15 +1347,43 @@ To configure the limit on a per-tenant basis, use the `-store.max-query-length` ### err-mimir-request-rate-limited -TBA +This error occurs when the rate of requests per second is exceeded for this tenant. + +How it **works**: + +- There is a per-tenant rate limit on the requests per second, and it's applied across all distributors for this tenant. +- The limit is implemented using [token buckets](https://en.wikipedia.org/wiki/Token_bucket). + +How to **fix** it: + +- Increase the per-tenant limit by using the `-distributor.request-rate-limit` (requests per second) and `-distributor.request-burst-size` (number of requests) options. ### err-mimir-ingestion-rate-limited -TBA +This error occurs when the rate of received samples, exemplars and metadata per second is exceeded for this tenant. + +How it **works**: + +- There is a per-tenant rate limit on the samples, exemplars and metadata that can be ingested per second, and it's applied across all distributors for this tenant. +- The limit is implemented using [token buckets](https://en.wikipedia.org/wiki/Token_bucket). + +How to **fix** it: + +- Increase the per-tenant limit by using the `-distributor.ingestion-rate-limit` (samples per second) and `-distributor.ingestion-burst-size` (number of samples) options. ### err-mimir-too-many-ha-clusters -TBA +This error occurs when a distributor rejects a write request because the number of high-availability (HA) clusters has hit the configured limit for this tenant. + +How it **works**: + +- The distributor implements an upper limit on the number of clusters that the HA tracker will keep track of for a single tenant. +- It is triggered when the write request would add a new cluster while the number the tenant currently has is already equal to the limit. +- To configure the limit, set the `-distributor.ha-tracker.max-clusters` option. + +How to **fix** it: + +- Increase the per-tenant limit by using the `-distributor.ha-tracker.max-clusters` option. ## Mimir routes by path From 07f530fcd1a03e237afc02869cc1691265678eb0 Mon Sep 17 00:00:00 2001 From: zenador Date: Tue, 7 Jun 2022 15:46:24 +0800 Subject: [PATCH 4/7] Apply suggestions from code review Co-authored-by: Marco Pracucci --- pkg/distributor/ha_tracker.go | 2 +- pkg/util/validation/errors.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/distributor/ha_tracker.go b/pkg/distributor/ha_tracker.go index cfb7575a493..129b2dfc581 100644 --- a/pkg/distributor/ha_tracker.go +++ b/pkg/distributor/ha_tracker.go @@ -532,7 +532,7 @@ type tooManyClustersError struct { func (e tooManyClustersError) Error() string { return globalerror.TooManyHAClusters.MessageWithLimitConfig( validation.HATrackerMaxClustersFlag, - fmt.Sprintf("too many HA clusters (limit: %d)", e.limit)) + fmt.Sprintf("the write request has been rejected because the maximum number of high-availability (HA) clusters has been reached for this tenant (limit: %d)", e.limit)) } // Needed for errors.Is to work properly. diff --git a/pkg/util/validation/errors.go b/pkg/util/validation/errors.go index 3c27dfe54e4..fad63d40b89 100644 --- a/pkg/util/validation/errors.go +++ b/pkg/util/validation/errors.go @@ -277,13 +277,13 @@ func NewMaxQueryLengthError(actualQueryLen, maxQueryLength time.Duration) LimitE func NewRequestRateLimitedError(limit float64, burst int) LimitError { return LimitError(globalerror.RequestRateLimited.MessageWithLimitConfigs( - fmt.Sprintf("request rate limit (%v) with burst (%d) exceeded", limit, burst), + fmt.Sprintf("the request has been rejected because the tenant exceeded the request rate limit, set to %v req/s with a maximum allowed burst of %d", limit, burst), requestRateFlag, requestBurstSizeFlag)) } func NewIngestionRateLimitedError(limit float64, burst, numSamples, numExemplars, numMetadata int) LimitError { return LimitError(globalerror.IngestionRateLimited.MessageWithLimitConfigs( - fmt.Sprintf("ingestion rate limit (%v) with burst (%d) exceeded while adding %d samples, %d exemplars and %d metadata", limit, burst, numSamples, numExemplars, numMetadata), + fmt.Sprintf("the request has been rejected because the tenant exceeded the ingestion rate limit, set to %v items/s with a maximum allowed burst of %d, while adding %d samples, %d exemplars and %d metadata", limit, burst, numSamples, numExemplars, numMetadata), ingestionRateFlag, ingestionBurstSizeFlag)) } From 82f0b5717901e992646cf5531f919be5fe3ce2f6 Mon Sep 17 00:00:00 2001 From: zenador Date: Tue, 7 Jun 2022 15:58:20 +0800 Subject: [PATCH 5/7] improve new MessageWithLimitConfig and add tests --- pkg/util/globalerror/errors.go | 4 +++- pkg/util/globalerror/errors_test.go | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pkg/util/globalerror/errors.go b/pkg/util/globalerror/errors.go index bdc61f45218..0033974e2c8 100644 --- a/pkg/util/globalerror/errors.go +++ b/pkg/util/globalerror/errors.go @@ -69,7 +69,9 @@ func (id ID) MessageWithLimitConfigs(msg, flag string, addFlags ...string) strin var sb strings.Builder sb.WriteString("-") sb.WriteString(flag) + plural := "" if len(addFlags) > 0 { + plural = "s" for _, addFlag := range addFlags[:len(addFlags)-1] { sb.WriteString(", -") sb.WriteString(addFlag) @@ -77,5 +79,5 @@ func (id ID) MessageWithLimitConfigs(msg, flag string, addFlags ...string) strin sb.WriteString(" and -") sb.WriteString(addFlags[len(addFlags)-1]) } - return fmt.Sprintf("%s (%s%s). You can adjust the related per-tenant limits by configuring %s, or by contacting your service administrator.", msg, errPrefix, id, sb.String()) + return fmt.Sprintf("%s (%s%s). You can adjust the related per-tenant limit%s by configuring %s, or by contacting your service administrator.", msg, errPrefix, id, plural, sb.String()) } diff --git a/pkg/util/globalerror/errors_test.go b/pkg/util/globalerror/errors_test.go index bb9f750d446..53b2a8439fc 100644 --- a/pkg/util/globalerror/errors_test.go +++ b/pkg/util/globalerror/errors_test.go @@ -21,3 +21,25 @@ func TestID_MessageWithLimitConfig(t *testing.T) { "an error (err-mimir-missing-metric-name). You can adjust the related per-tenant limit by configuring -my-flag, or by contacting your service administrator.", MissingMetricName.MessageWithLimitConfig("my-flag", "an error")) } + +func TestID_MessageWithLimitConfigs(t *testing.T) { + for _, tc := range []struct { + actual string + expected string + }{ + { + actual: "an error (err-mimir-missing-metric-name). You can adjust the related per-tenant limit by configuring -my-flag1, or by contacting your service administrator.", + expected: MissingMetricName.MessageWithLimitConfigs("an error", "my-flag1"), + }, + { + actual: "an error (err-mimir-missing-metric-name). You can adjust the related per-tenant limits by configuring -my-flag1 and -my-flag2, or by contacting your service administrator.", + expected: MissingMetricName.MessageWithLimitConfigs("an error", "my-flag1", "my-flag2"), + }, + { + actual: "an error (err-mimir-missing-metric-name). You can adjust the related per-tenant limits by configuring -my-flag1, -my-flag2 and -my-flag3, or by contacting your service administrator.", + expected: MissingMetricName.MessageWithLimitConfigs("an error", "my-flag1", "my-flag2", "my-flag3"), + }, + } { + assert.Equal(t, tc.actual, tc.expected) + } +} From c2583f7531ca774872ed2efdb2b67c7baea2ff92 Mon Sep 17 00:00:00 2001 From: zenador Date: Wed, 8 Jun 2022 17:11:14 +0800 Subject: [PATCH 6/7] Apply suggestions from code review Co-authored-by: Marco Pracucci --- .../operators-guide/mimir-runbooks/_index.md | 14 +++++++------- pkg/util/globalerror/errors.go | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/sources/operators-guide/mimir-runbooks/_index.md b/docs/sources/operators-guide/mimir-runbooks/_index.md index 612564dfe1b..600c198d55e 100644 --- a/docs/sources/operators-guide/mimir-runbooks/_index.md +++ b/docs/sources/operators-guide/mimir-runbooks/_index.md @@ -1347,16 +1347,16 @@ To configure the limit on a per-tenant basis, use the `-store.max-query-length` ### err-mimir-request-rate-limited -This error occurs when the rate of requests per second is exceeded for this tenant. +This error occurs when the rate of write requests per second is exceeded for this tenant. How it **works**: -- There is a per-tenant rate limit on the requests per second, and it's applied across all distributors for this tenant. +- There is a per-tenant rate limit on the write requests per second, and it's applied across all distributors for this tenant. - The limit is implemented using [token buckets](https://en.wikipedia.org/wiki/Token_bucket). How to **fix** it: -- Increase the per-tenant limit by using the `-distributor.request-rate-limit` (requests per second) and `-distributor.request-burst-size` (number of requests) options. +- Increase the per-tenant limit by using the `-distributor.request-rate-limit` (requests per second) and `-distributor.request-burst-size` (number of requests) options (or `request_rate` and `request_burst_size` in the runtime configuration). The configurable burst represents how many requests can temporarily exceed the limit, in case of short traffic peaks. The configured burst size must be greater or equal than the configured limit. ### err-mimir-ingestion-rate-limited @@ -1369,21 +1369,21 @@ How it **works**: How to **fix** it: -- Increase the per-tenant limit by using the `-distributor.ingestion-rate-limit` (samples per second) and `-distributor.ingestion-burst-size` (number of samples) options. +- Increase the per-tenant limit by using the `-distributor.ingestion-rate-limit` (samples per second) and `-distributor.ingestion-burst-size` (number of samples) options (or `ingestion_rate` and `ingestion_burst_size` in the runtime configuration). The configurable burst represents how many samples, exemplars and metadata can temporarily exceed the limit, in case of short traffic peaks. The configured burst size must be greater or equal than the configured limit. ### err-mimir-too-many-ha-clusters -This error occurs when a distributor rejects a write request because the number of high-availability (HA) clusters has hit the configured limit for this tenant. +This error occurs when a distributor rejects a write request because the number of [high-availability (HA) clusters]({{< relref "../configuring/configuring-high-availability-deduplication.md" >}}) has hit the configured limit for this tenant. How it **works**: - The distributor implements an upper limit on the number of clusters that the HA tracker will keep track of for a single tenant. - It is triggered when the write request would add a new cluster while the number the tenant currently has is already equal to the limit. -- To configure the limit, set the `-distributor.ha-tracker.max-clusters` option. +- To configure the limit, set the `-distributor.ha-tracker.max-clusters` option (or `ha_max_clusters` in the runtime configuration). How to **fix** it: -- Increase the per-tenant limit by using the `-distributor.ha-tracker.max-clusters` option. +- Increase the per-tenant limit by using the `-distributor.ha-tracker.max-clusters` option (or `ha_max_clusters` in the runtime configuration). ## Mimir routes by path diff --git a/pkg/util/globalerror/errors.go b/pkg/util/globalerror/errors.go index 0033974e2c8..f43765a5c58 100644 --- a/pkg/util/globalerror/errors.go +++ b/pkg/util/globalerror/errors.go @@ -49,8 +49,8 @@ const ( MetricMetadataUnitTooLong ID = "unit-too-long" MaxQueryLength ID = "max-query-length" - RequestRateLimited ID = "request-rate-limited" - IngestionRateLimited ID = "ingestion-rate-limited" + RequestRateLimited ID = "tenant-max-request-rate" + IngestionRateLimited ID = "tenant-max-ingestion-rate" TooManyHAClusters ID = "too-many-ha-clusters" ) From fd95e66f53bbbd57bacc4f629f4aa382a3275f09 Mon Sep 17 00:00:00 2001 From: zenador Date: Wed, 8 Jun 2022 17:13:18 +0800 Subject: [PATCH 7/7] Update from changes in code review --- docs/sources/operators-guide/mimir-runbooks/_index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/sources/operators-guide/mimir-runbooks/_index.md b/docs/sources/operators-guide/mimir-runbooks/_index.md index 600c198d55e..99b3aac485e 100644 --- a/docs/sources/operators-guide/mimir-runbooks/_index.md +++ b/docs/sources/operators-guide/mimir-runbooks/_index.md @@ -1345,7 +1345,7 @@ Mimir has a limit on the query length. This limit is applied to partial queries, after they've split (according to time) by the query-frontend. This limit protects the system’s stability from potential abuse or mistakes. To configure the limit on a per-tenant basis, use the `-store.max-query-length` option (or `max_query_length` in the runtime configuration). -### err-mimir-request-rate-limited +### err-mimir-tenant-max-request-rate This error occurs when the rate of write requests per second is exceeded for this tenant. @@ -1358,7 +1358,7 @@ How to **fix** it: - Increase the per-tenant limit by using the `-distributor.request-rate-limit` (requests per second) and `-distributor.request-burst-size` (number of requests) options (or `request_rate` and `request_burst_size` in the runtime configuration). The configurable burst represents how many requests can temporarily exceed the limit, in case of short traffic peaks. The configured burst size must be greater or equal than the configured limit. -### err-mimir-ingestion-rate-limited +### err-mimir-tenant-max-ingestion-rate This error occurs when the rate of received samples, exemplars and metadata per second is exceeded for this tenant. @@ -1379,7 +1379,7 @@ How it **works**: - The distributor implements an upper limit on the number of clusters that the HA tracker will keep track of for a single tenant. - It is triggered when the write request would add a new cluster while the number the tenant currently has is already equal to the limit. -- To configure the limit, set the `-distributor.ha-tracker.max-clusters` option (or `ha_max_clusters` in the runtime configuration). +- To configure the limit, set the `-distributor.ha-tracker.max-clusters` option (or `ha_max_clusters` in the runtime configuration). How to **fix** it: