From 6366160124e9b1175a6d6b31ebfb5c9c44df1b49 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Mon, 25 Oct 2021 15:03:16 -0400 Subject: [PATCH 1/2] only exports tenant limits which differ from defaults and export defaults under `loki_overrides_defaults` metric. --- pkg/loki/loki.go | 2 +- pkg/loki/modules.go | 4 +- pkg/validation/exporter.go | 73 ++++++++++++++++++++++----------- pkg/validation/exporter_test.go | 9 +++- 4 files changed, 60 insertions(+), 28 deletions(-) diff --git a/pkg/loki/loki.go b/pkg/loki/loki.go index 51d4d0cfd59ad..2a0612c0f0dce 100644 --- a/pkg/loki/loki.go +++ b/pkg/loki/loki.go @@ -431,7 +431,7 @@ func (t *Loki) setupModuleManager() error { deps := map[string][]string{ Ring: {RuntimeConfig, Server, MemberlistKV}, Overrides: {RuntimeConfig}, - OverridesExporter: {RuntimeConfig, Server}, + OverridesExporter: {Overrides, Server}, TenantConfigs: {RuntimeConfig}, Distributor: {Ring, Server, Overrides, TenantConfigs}, Store: {Overrides}, diff --git a/pkg/loki/modules.go b/pkg/loki/modules.go index a8b62b0c3ee01..0b93c19ad5942 100644 --- a/pkg/loki/modules.go +++ b/pkg/loki/modules.go @@ -164,12 +164,12 @@ func (t *Loki) initOverrides() (_ services.Service, err error) { } func (t *Loki) initOverridesExporter() (services.Service, error) { - if t.Cfg.isModuleEnabled(OverridesExporter) && t.TenantLimits == nil { + if t.Cfg.isModuleEnabled(OverridesExporter) && t.TenantLimits == nil || t.overrides == nil { // This target isn't enabled by default ("all") and requires per-tenant limits to run. 
return nil, errors.New("overrides-exporter has been enabled, but no runtime configuration file was configured") } - exporter := validation.NewOverridesExporter(t.TenantLimits) + exporter := validation.NewOverridesExporter(t.overrides) prometheus.MustRegister(exporter) // The overrides-exporter has no state and reads overrides for runtime configuration each time it diff --git a/pkg/validation/exporter.go b/pkg/validation/exporter.go index 7a7708208775a..78eb245dffbb6 100644 --- a/pkg/validation/exporter.go +++ b/pkg/validation/exporter.go @@ -11,50 +11,77 @@ import ( ) type OverridesExporter struct { - tenantLimits TenantLimits - description *prometheus.Desc + overrides *Overrides + // tenantLimits TenantLimits + tenantDesc *prometheus.Desc + defaultsDesc *prometheus.Desc } // TODO(jordanrushing): break out overrides from defaults? -func NewOverridesExporter(tenantLimits TenantLimits) *OverridesExporter { +func NewOverridesExporter(overrides *Overrides) *OverridesExporter { return &OverridesExporter{ - tenantLimits: tenantLimits, - description: prometheus.NewDesc( + overrides: overrides, + tenantDesc: prometheus.NewDesc( "loki_overrides", "Resource limit overrides applied to tenants", []string{"limit_name", "user"}, nil, ), + defaultsDesc: prometheus.NewDesc( + "loki_overrides_defaults", + "Default values for resource limit overrides applied to tenants", + []string{"limit_name"}, + nil, + ), } } func (oe *OverridesExporter) Describe(ch chan<- *prometheus.Desc) { - ch <- oe.description + ch <- oe.tenantDesc + ch <- oe.defaultsDesc } func (oe *OverridesExporter) Collect(ch chan<- prometheus.Metric) { - var metricValue float64 - var metricLabelValue string - var rv reflect.Value + extract := func(val reflect.Value, i int) (float64, bool) { + switch val.Field(i).Interface().(type) { + case int, time.Duration: + return float64(val.Field(i).Int()), true + case model.Duration: + return float64(val.Field(i).Interface().(model.Duration)), true + case flagext.ByteSize: + 
return float64(val.Field(i).Uint()), true + case float64: + return val.Field(i).Float(), true + default: + return 0, false + } + } + + defs := reflect.ValueOf(oe.overrides.DefaultLimits()).Elem() + + for i := 0; i < defs.NumField(); i++ { + if v, ok := extract(defs, i); ok { + metricLabelValue := defs.Type().Field(i).Tag.Get("yaml") + ch <- prometheus.MustNewConstMetric(oe.defaultsDesc, prometheus.GaugeValue, v, metricLabelValue) + } - for tenant, limits := range oe.tenantLimits.AllByUserID() { - rv = reflect.ValueOf(limits).Elem() + } + + for tenant, limits := range oe.overrides.AllByUserID() { + rv := reflect.ValueOf(limits).Elem() for i := 0; i < rv.NumField(); i++ { - switch rv.Field(i).Interface().(type) { - case int, time.Duration: - metricValue = float64(rv.Field(i).Int()) - case model.Duration: - metricValue = float64(rv.Field(i).Interface().(model.Duration)) - case flagext.ByteSize: - metricValue = float64(rv.Field(i).Uint()) - case float64: - metricValue = rv.Field(i).Float() - default: + + v, ok := extract(rv, i) + + // Only report fields which are explicitly overridden + if !ok || rv.Field(i).Interface() == defs.Field(i).Interface() { continue + } - metricLabelValue = rv.Type().Field(i).Tag.Get("yaml") - ch <- prometheus.MustNewConstMetric(oe.description, prometheus.GaugeValue, metricValue, metricLabelValue, tenant) + metricLabelValue := rv.Type().Field(i).Tag.Get("yaml") + ch <- prometheus.MustNewConstMetric(oe.tenantDesc, prometheus.GaugeValue, v, metricLabelValue, tenant) } } + } diff --git a/pkg/validation/exporter_test.go b/pkg/validation/exporter_test.go index 436797c7afb09..a7a51327599c1 100644 --- a/pkg/validation/exporter_test.go +++ b/pkg/validation/exporter_test.go @@ -5,6 +5,7 @@ import ( "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) type mockTenantLimits struct { @@ -24,9 +25,11 @@ func (l *mockTenantLimits) TenantLimits(userID string) *Limits { func 
(l *mockTenantLimits) AllByUserID() map[string]*Limits { return l.limits } func TestOverridesExporter_noConfig(t *testing.T) { - exporter := NewOverridesExporter(newMockTenantLimits(nil)) + overrides, _ := NewOverrides(Limits{}, newMockTenantLimits(nil)) + exporter := NewOverridesExporter(overrides) count := testutil.CollectAndCount(exporter, "loki_overrides") assert.Equal(t, 0, count) + require.Greater(t, testutil.CollectAndCount(exporter, "loki_overrides_defaults"), 0) } func TestOverridesExporter_withConfig(t *testing.T) { @@ -35,7 +38,9 @@ func TestOverridesExporter_withConfig(t *testing.T) { MaxQueriersPerTenant: 5, }, } - exporter := NewOverridesExporter(newMockTenantLimits(tenantLimits)) + overrides, _ := NewOverrides(Limits{}, newMockTenantLimits(tenantLimits)) + exporter := NewOverridesExporter(overrides) count := testutil.CollectAndCount(exporter, "loki_overrides") assert.Greater(t, count, 0) + require.Greater(t, testutil.CollectAndCount(exporter, "loki_overrides_defaults"), 0) } From 5b4847ae12b0cd640fbf067f0151b1afa58a9c35 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Mon, 25 Oct 2021 15:14:41 -0400 Subject: [PATCH 2/2] docs & changelog --- CHANGELOG.md | 1 + docs/sources/operations/overrides-exporter.md | 89 ++++++++++--------- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db2ae8c9b0dcb..a76d464b42e99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ * [4425](https://github.com/grafana/loki/pull/4425) **trevorwhitney** and **slim-bean**: Add a ring for the query scheduler * [4519](https://github.com/grafana/loki/pull/4519) **DylanGuedes** and **replay**: Loki: Enable FIFO cache by default * [4520](https://github.com/grafana/loki/pull/4520) **jordanrushing** and **owen-d**: Introduce overrides-exporter module for tenant limits +* [4542](https://github.com/grafana/loki/pull/4542) **owen-d**: Introduce the `loki_overrides_defaults` metric and only export diffs for tenant limits. 
# 2.3.0 (2021/08/06) diff --git a/docs/sources/operations/overrides-exporter.md b/docs/sources/operations/overrides-exporter.md index 2dba66507d60b..f3323731837f5 100644 --- a/docs/sources/operations/overrides-exporter.md +++ b/docs/sources/operations/overrides-exporter.md @@ -11,7 +11,7 @@ Configuration updates to tenant limits can be applied to Loki without restart vi ## Example -The `overrides-exporter` module is disabled by default. We recommend running a single instance per cluster to avoid issues with metric cardinality as the `overrides-exporter` creates ~40 metrics per tenant with overrides configured. +The `overrides-exporter` module is disabled by default. We recommend running a single instance per cluster to avoid issues with metric cardinality. The `overrides-exporter` exports one `loki_overrides_defaults` series for every scalar field in the limits configuration, set to that field's default value after the Loki configuration is loaded. It also exports one `loki_overrides` series for _every_ field that differs from its default, for _every_ tenant. 
Using an example `runtime.yaml`: @@ -32,47 +32,52 @@ loki -target=overrides-exporter -runtime-config.file=runtime.yaml -config.file=b To inspect the tenant limit overrides: ```shell -$ curl -sq localhost:8080/metrics | grep override -# HELP loki_overrides Resource limit overrides applied to tenants -# TYPE loki_overrides gauge -loki_overrides{limit_name="cardinality_limit",user="user1"} 100000 -loki_overrides{limit_name="creation_grace_period",user="user1"} 6e+11 -loki_overrides{limit_name="ingestion_burst_size_mb",user="user1"} 350000 -loki_overrides{limit_name="ingestion_rate_mb",user="user1"} 10 -loki_overrides{limit_name="max_cache_freshness_per_query",user="user1"} 6e+10 -loki_overrides{limit_name="max_chunks_per_query",user="user1"} 100000 -loki_overrides{limit_name="max_concurrent_tail_requests",user="user1"} 10 -loki_overrides{limit_name="max_entries_limit_per_query",user="user1"} 5000 -loki_overrides{limit_name="max_global_streams_per_user",user="user1"} 5000 -loki_overrides{limit_name="max_label_name_length",user="user1"} 1024 -loki_overrides{limit_name="max_label_names_per_series",user="user1"} 30 -loki_overrides{limit_name="max_label_value_length",user="user1"} 2048 -loki_overrides{limit_name="max_line_size",user="user1"} 0 -loki_overrides{limit_name="max_queriers_per_tenant",user="user1"} 0 -loki_overrides{limit_name="max_query_length",user="user1"} 2.5956e+15 -loki_overrides{limit_name="max_query_lookback",user="user1"} 0 -loki_overrides{limit_name="max_query_parallelism",user="user1"} 32 -loki_overrides{limit_name="max_query_series",user="user1"} 1000 -loki_overrides{limit_name="max_streams_matchers_per_query",user="user1"} 1000 -loki_overrides{limit_name="max_streams_per_user",user="user1"} 100000 -loki_overrides{limit_name="min_sharding_lookback",user="user1"} 0 -loki_overrides{limit_name="per_stream_rate_limit",user="user1"} 3.145728e+06 -loki_overrides{limit_name="per_stream_rate_limit_burst",user="user1"} 1.572864e+07 
-loki_overrides{limit_name="per_tenant_override_period",user="user1"} 1e+10 -loki_overrides{limit_name="reject_old_samples_max_age",user="user1"} 1.2096e+15 -loki_overrides{limit_name="retention_period",user="user1"} 2.6784e+15 -loki_overrides{limit_name="ruler_evaluation_delay_duration",user="user1"} 0 -loki_overrides{limit_name="ruler_max_rule_groups_per_tenant",user="user1"} 0 -loki_overrides{limit_name="ruler_max_rules_per_rule_group",user="user1"} 0 -loki_overrides{limit_name="ruler_remote_write_queue_batch_send_deadline",user="user1"} 0 -loki_overrides{limit_name="ruler_remote_write_queue_capacity",user="user1"} 0 -loki_overrides{limit_name="ruler_remote_write_queue_max_backoff",user="user1"} 0 -loki_overrides{limit_name="ruler_remote_write_queue_max_samples_per_send",user="user1"} 0 -loki_overrides{limit_name="ruler_remote_write_queue_max_shards",user="user1"} 0 -loki_overrides{limit_name="ruler_remote_write_queue_min_backoff",user="user1"} 0 -loki_overrides{limit_name="ruler_remote_write_queue_min_shards",user="user1"} 0 -loki_overrides{limit_name="ruler_remote_write_timeout",user="user1"} 0 -loki_overrides{limit_name="split_queries_by_interval",user="user1"} 0 +$ curl -sq localhost:8080/metrics | grep override +# HELP loki_overrides Resource limit overrides applied to tenants +# TYPE loki_overrides gauge +loki_overrides{limit_name="ingestion_rate_mb",user="tenant_1"} 10 +loki_overrides{limit_name="max_chunks_per_query",user="tenant_1"} 100000 +loki_overrides{limit_name="max_streams_per_user",user="tenant_1"} 100000 +# HELP loki_overrides_defaults Default values for resource limit overrides applied to tenants +# TYPE loki_overrides_defaults gauge +loki_overrides_defaults{limit_name="cardinality_limit"} 100000 +loki_overrides_defaults{limit_name="creation_grace_period"} 6e+11 +loki_overrides_defaults{limit_name="ingestion_burst_size_mb"} 6 +loki_overrides_defaults{limit_name="ingestion_rate_mb"} 4 
+loki_overrides_defaults{limit_name="max_cache_freshness_per_query"} 6e+10 +loki_overrides_defaults{limit_name="max_chunks_per_query"} 2e+06 +loki_overrides_defaults{limit_name="max_concurrent_tail_requests"} 10 +loki_overrides_defaults{limit_name="max_entries_limit_per_query"} 5000 +loki_overrides_defaults{limit_name="max_global_streams_per_user"} 5000 +loki_overrides_defaults{limit_name="max_label_name_length"} 1024 +loki_overrides_defaults{limit_name="max_label_names_per_series"} 30 +loki_overrides_defaults{limit_name="max_label_value_length"} 2048 +loki_overrides_defaults{limit_name="max_line_size"} 0 +loki_overrides_defaults{limit_name="max_queriers_per_tenant"} 0 +loki_overrides_defaults{limit_name="max_query_length"} 2.5956e+15 +loki_overrides_defaults{limit_name="max_query_lookback"} 0 +loki_overrides_defaults{limit_name="max_query_parallelism"} 32 +loki_overrides_defaults{limit_name="max_query_series"} 500 +loki_overrides_defaults{limit_name="max_streams_matchers_per_query"} 1000 +loki_overrides_defaults{limit_name="max_streams_per_user"} 0 +loki_overrides_defaults{limit_name="min_sharding_lookback"} 0 +loki_overrides_defaults{limit_name="per_stream_rate_limit"} 3.145728e+06 +loki_overrides_defaults{limit_name="per_stream_rate_limit_burst"} 1.572864e+07 +loki_overrides_defaults{limit_name="per_tenant_override_period"} 1e+10 +loki_overrides_defaults{limit_name="reject_old_samples_max_age"} 1.2096e+15 +loki_overrides_defaults{limit_name="retention_period"} 2.6784e+15 +loki_overrides_defaults{limit_name="ruler_evaluation_delay_duration"} 0 +loki_overrides_defaults{limit_name="ruler_max_rule_groups_per_tenant"} 0 +loki_overrides_defaults{limit_name="ruler_max_rules_per_rule_group"} 0 +loki_overrides_defaults{limit_name="ruler_remote_write_queue_batch_send_deadline"} 0 +loki_overrides_defaults{limit_name="ruler_remote_write_queue_capacity"} 0 +loki_overrides_defaults{limit_name="ruler_remote_write_queue_max_backoff"} 0 
+loki_overrides_defaults{limit_name="ruler_remote_write_queue_max_samples_per_send"} 0 +loki_overrides_defaults{limit_name="ruler_remote_write_queue_max_shards"} 0 +loki_overrides_defaults{limit_name="ruler_remote_write_queue_min_backoff"} 0 +loki_overrides_defaults{limit_name="ruler_remote_write_queue_min_shards"} 0 +loki_overrides_defaults{limit_name="ruler_remote_write_timeout"} 0 +loki_overrides_defaults{limit_name="split_queries_by_interval"} 0 ``` Alerts can be created based on these metrics to inform operators when tenants are close to hitting their limits allowing for increases to be applied before the tenant limits are exceeded.