Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

only exports tenant limits which differ from defaults and export defa… #4542

Merged
merged 3 commits into from
Oct 26, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* [4425](https://github.com/grafana/loki/pull/4425) **trevorwhitney** and **slim-bean**: Add a ring for the query scheduler
* [4519](https://github.com/grafana/loki/pull/4519) **DylanGuedes** and **replay**: Loki: Enable FIFO cache by default
* [4520](https://github.com/grafana/loki/pull/4520) **jordanrushing** and **owen-d**: Introduce overrides-exporter module for tenant limits
* [4542](https://github.com/grafana/loki/pull/4542) **owen-d**: Introduce the `loki_overrides_defaults` metric and only export diffs for tenant limits.

# 2.3.0 (2021/08/06)

Expand Down
89 changes: 47 additions & 42 deletions docs/sources/operations/overrides-exporter.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Configuration updates to tenant limits can be applied to Loki without restart vi

## Example

The `overrides-exporter` module is disabled by default. We recommend running a single instance per cluster to avoid issues with metric cardinality as the `overrides-exporter` creates ~40 metrics per tenant with overrides configured.
The `overrides-exporter` module is disabled by default. We recommend running a single instance per cluster to avoid issues with metric cardinality. For every scalar field in the limits configuration, the `overrides-exporter` exposes one `loki_overrides_defaults` series holding the default value of that field after the Loki configuration has been loaded. It also exposes one `loki_overrides` series for _every_ field that differs from its default, for _every_ tenant.

Using an example `runtime.yaml`:

Expand All @@ -32,47 +32,52 @@ loki -target=overrides-exporter -runtime-config.file=runtime.yaml -config.file=b
To inspect the tenant limit overrides:

```shell
$ curl -sq localhost:8080/metrics | grep override
# HELP loki_overrides Resource limit overrides applied to tenants
# TYPE loki_overrides gauge
loki_overrides{limit_name="cardinality_limit",user="user1"} 100000
loki_overrides{limit_name="creation_grace_period",user="user1"} 6e+11
loki_overrides{limit_name="ingestion_burst_size_mb",user="user1"} 350000
loki_overrides{limit_name="ingestion_rate_mb",user="user1"} 10
loki_overrides{limit_name="max_cache_freshness_per_query",user="user1"} 6e+10
loki_overrides{limit_name="max_chunks_per_query",user="user1"} 100000
loki_overrides{limit_name="max_concurrent_tail_requests",user="user1"} 10
loki_overrides{limit_name="max_entries_limit_per_query",user="user1"} 5000
loki_overrides{limit_name="max_global_streams_per_user",user="user1"} 5000
loki_overrides{limit_name="max_label_name_length",user="user1"} 1024
loki_overrides{limit_name="max_label_names_per_series",user="user1"} 30
loki_overrides{limit_name="max_label_value_length",user="user1"} 2048
loki_overrides{limit_name="max_line_size",user="user1"} 0
loki_overrides{limit_name="max_queriers_per_tenant",user="user1"} 0
loki_overrides{limit_name="max_query_length",user="user1"} 2.5956e+15
loki_overrides{limit_name="max_query_lookback",user="user1"} 0
loki_overrides{limit_name="max_query_parallelism",user="user1"} 32
loki_overrides{limit_name="max_query_series",user="user1"} 1000
loki_overrides{limit_name="max_streams_matchers_per_query",user="user1"} 1000
loki_overrides{limit_name="max_streams_per_user",user="user1"} 100000
loki_overrides{limit_name="min_sharding_lookback",user="user1"} 0
loki_overrides{limit_name="per_stream_rate_limit",user="user1"} 3.145728e+06
loki_overrides{limit_name="per_stream_rate_limit_burst",user="user1"} 1.572864e+07
loki_overrides{limit_name="per_tenant_override_period",user="user1"} 1e+10
loki_overrides{limit_name="reject_old_samples_max_age",user="user1"} 1.2096e+15
loki_overrides{limit_name="retention_period",user="user1"} 2.6784e+15
loki_overrides{limit_name="ruler_evaluation_delay_duration",user="user1"} 0
loki_overrides{limit_name="ruler_max_rule_groups_per_tenant",user="user1"} 0
loki_overrides{limit_name="ruler_max_rules_per_rule_group",user="user1"} 0
loki_overrides{limit_name="ruler_remote_write_queue_batch_send_deadline",user="user1"} 0
loki_overrides{limit_name="ruler_remote_write_queue_capacity",user="user1"} 0
loki_overrides{limit_name="ruler_remote_write_queue_max_backoff",user="user1"} 0
loki_overrides{limit_name="ruler_remote_write_queue_max_samples_per_send",user="user1"} 0
loki_overrides{limit_name="ruler_remote_write_queue_max_shards",user="user1"} 0
loki_overrides{limit_name="ruler_remote_write_queue_min_backoff",user="user1"} 0
loki_overrides{limit_name="ruler_remote_write_queue_min_shards",user="user1"} 0
loki_overrides{limit_name="ruler_remote_write_timeout",user="user1"} 0
loki_overrides{limit_name="split_queries_by_interval",user="user1"} 0
$ curl -sq localhost:8080/metrics | grep override
# HELP loki_overrides Resource limit overrides applied to tenants
# TYPE loki_overrides gauge
loki_overrides{limit_name="ingestion_rate_mb",user="tenant_1"} 10
loki_overrides{limit_name="max_chunks_per_query",user="tenant_1"} 100000
loki_overrides{limit_name="max_streams_per_user",user="tenant_1"} 100000
# HELP loki_overrides_defaults Default values for resource limit overrides applied to tenants
# TYPE loki_overrides_defaults gauge
loki_overrides_defaults{limit_name="cardinality_limit"} 100000
loki_overrides_defaults{limit_name="creation_grace_period"} 6e+11
loki_overrides_defaults{limit_name="ingestion_burst_size_mb"} 6
loki_overrides_defaults{limit_name="ingestion_rate_mb"} 4
loki_overrides_defaults{limit_name="max_cache_freshness_per_query"} 6e+10
loki_overrides_defaults{limit_name="max_chunks_per_query"} 2e+06
loki_overrides_defaults{limit_name="max_concurrent_tail_requests"} 10
loki_overrides_defaults{limit_name="max_entries_limit_per_query"} 5000
loki_overrides_defaults{limit_name="max_global_streams_per_user"} 5000
loki_overrides_defaults{limit_name="max_label_name_length"} 1024
loki_overrides_defaults{limit_name="max_label_names_per_series"} 30
loki_overrides_defaults{limit_name="max_label_value_length"} 2048
loki_overrides_defaults{limit_name="max_line_size"} 0
loki_overrides_defaults{limit_name="max_queriers_per_tenant"} 0
loki_overrides_defaults{limit_name="max_query_length"} 2.5956e+15
loki_overrides_defaults{limit_name="max_query_lookback"} 0
loki_overrides_defaults{limit_name="max_query_parallelism"} 32
loki_overrides_defaults{limit_name="max_query_series"} 500
loki_overrides_defaults{limit_name="max_streams_matchers_per_query"} 1000
loki_overrides_defaults{limit_name="max_streams_per_user"} 0
loki_overrides_defaults{limit_name="min_sharding_lookback"} 0
loki_overrides_defaults{limit_name="per_stream_rate_limit"} 3.145728e+06
loki_overrides_defaults{limit_name="per_stream_rate_limit_burst"} 1.572864e+07
loki_overrides_defaults{limit_name="per_tenant_override_period"} 1e+10
loki_overrides_defaults{limit_name="reject_old_samples_max_age"} 1.2096e+15
loki_overrides_defaults{limit_name="retention_period"} 2.6784e+15
loki_overrides_defaults{limit_name="ruler_evaluation_delay_duration"} 0
loki_overrides_defaults{limit_name="ruler_max_rule_groups_per_tenant"} 0
loki_overrides_defaults{limit_name="ruler_max_rules_per_rule_group"} 0
loki_overrides_defaults{limit_name="ruler_remote_write_queue_batch_send_deadline"} 0
loki_overrides_defaults{limit_name="ruler_remote_write_queue_capacity"} 0
loki_overrides_defaults{limit_name="ruler_remote_write_queue_max_backoff"} 0
loki_overrides_defaults{limit_name="ruler_remote_write_queue_max_samples_per_send"} 0
loki_overrides_defaults{limit_name="ruler_remote_write_queue_max_shards"} 0
loki_overrides_defaults{limit_name="ruler_remote_write_queue_min_backoff"} 0
loki_overrides_defaults{limit_name="ruler_remote_write_queue_min_shards"} 0
loki_overrides_defaults{limit_name="ruler_remote_write_timeout"} 0
loki_overrides_defaults{limit_name="split_queries_by_interval"} 0
```

Alerts can be created based on these metrics to inform operators when tenants are close to hitting their limits, allowing for increases to be applied before the tenant limits are exceeded.
2 changes: 1 addition & 1 deletion pkg/loki/loki.go
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ func (t *Loki) setupModuleManager() error {
deps := map[string][]string{
Ring: {RuntimeConfig, Server, MemberlistKV},
Overrides: {RuntimeConfig},
OverridesExporter: {RuntimeConfig, Server},
OverridesExporter: {Overrides, Server},
TenantConfigs: {RuntimeConfig},
Distributor: {Ring, Server, Overrides, TenantConfigs},
Store: {Overrides},
Expand Down
4 changes: 2 additions & 2 deletions pkg/loki/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,12 @@ func (t *Loki) initOverrides() (_ services.Service, err error) {
}

func (t *Loki) initOverridesExporter() (services.Service, error) {
if t.Cfg.isModuleEnabled(OverridesExporter) && t.TenantLimits == nil {
if t.Cfg.isModuleEnabled(OverridesExporter) && t.TenantLimits == nil || t.overrides == nil {
// This target isn't enabled by default ("all") and requires per-tenant limits to run.
return nil, errors.New("overrides-exporter has been enabled, but no runtime configuration file was configured")
}

exporter := validation.NewOverridesExporter(t.TenantLimits)
exporter := validation.NewOverridesExporter(t.overrides)
prometheus.MustRegister(exporter)

// The overrides-exporter has no state and reads overrides for runtime configuration each time it
Expand Down
73 changes: 50 additions & 23 deletions pkg/validation/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,50 +11,77 @@ import (
)

// OverridesExporter holds the state read by its Describe and Collect methods
// to publish limit values as Prometheus gauge metrics.
// NOTE(review): this hunk comes from a diff view, so fields from both the old
// and the new revision appear side by side — confirm which set is current.
type OverridesExporter struct {
tenantLimits TenantLimits
description *prometheus.Desc
// overrides supplies DefaultLimits() and AllByUserID(), both read in Collect.
overrides *Overrides
// tenantLimits TenantLimits
// tenantDesc is the descriptor for "loki_overrides" (labels: limit_name, user).
tenantDesc *prometheus.Desc
// defaultsDesc is the descriptor for "loki_overrides_defaults" (label: limit_name).
defaultsDesc *prometheus.Desc
}

// NewOverridesExporter returns an OverridesExporter that stores its argument
// and pre-builds the metric descriptors later sent by Describe and used by
// Collect.
// NOTE(review): two signatures (old and new side of the diff) are shown below.
// TODO(jordanrushing): break out overrides from defaults?
func NewOverridesExporter(tenantLimits TenantLimits) *OverridesExporter {
func NewOverridesExporter(overrides *Overrides) *OverridesExporter {
return &OverridesExporter{
tenantLimits: tenantLimits,
description: prometheus.NewDesc(
overrides: overrides,
// Per-tenant metric: one series per (limit_name, user) pair.
tenantDesc: prometheus.NewDesc(
"loki_overrides",
"Resource limit overrides applied to tenants",
[]string{"limit_name", "user"},
nil,
),
// Defaults metric: one series per limit_name, with no tenant label.
defaultsDesc: prometheus.NewDesc(
"loki_overrides_defaults",
"Default values for resource limit overrides applied to tenants",
[]string{"limit_name"},
nil,
),
}
}

// Describe sends the exporter's metric descriptors on ch.
// NOTE(review): the old (description) and new (tenantDesc, defaultsDesc) sends
// both appear here because this is a diff rendering.
func (oe *OverridesExporter) Describe(ch chan<- *prometheus.Desc) {
ch <- oe.description
ch <- oe.tenantDesc
ch <- oe.defaultsDesc
}

// Collect walks the limits structs via reflection and sends gauge metrics on
// ch: one "defaults" gauge per supported scalar field of DefaultLimits(), and
// one per-tenant gauge per supported scalar field whose value differs from the
// corresponding default.
// NOTE(review): this hunk comes from a diff view; the old switch-based body
// and the new extract-based body are interleaved below.
func (oe *OverridesExporter) Collect(ch chan<- prometheus.Metric) {
var metricValue float64
var metricLabelValue string
var rv reflect.Value
// extract converts field i of val to float64. It reports false for any
// field whose type is not one of the cases listed in the switch.
extract := func(val reflect.Value, i int) (float64, bool) {
switch val.Field(i).Interface().(type) {
case int, time.Duration:
return float64(val.Field(i).Int()), true
case model.Duration:
return float64(val.Field(i).Interface().(model.Duration)), true
case flagext.ByteSize:
return float64(val.Field(i).Uint()), true
case float64:
return val.Field(i).Float(), true
default:
return 0, false
}
}

// Elem() dereferences the value returned by DefaultLimits().
// presumably a pointer to the limits struct — TODO confirm.
defs := reflect.ValueOf(oe.overrides.DefaultLimits()).Elem()

for i := 0; i < defs.NumField(); i++ {
if v, ok := extract(defs, i); ok {
// The limit_name label value is the field's `yaml` struct tag.
metricLabelValue := defs.Type().Field(i).Tag.Get("yaml")
ch <- prometheus.MustNewConstMetric(oe.defaultsDesc, prometheus.GaugeValue, v, metricLabelValue)
}

for tenant, limits := range oe.tenantLimits.AllByUserID() {
rv = reflect.ValueOf(limits).Elem()
}

for tenant, limits := range oe.overrides.AllByUserID() {
rv := reflect.ValueOf(limits).Elem()
for i := 0; i < rv.NumField(); i++ {
switch rv.Field(i).Interface().(type) {
case int, time.Duration:
metricValue = float64(rv.Field(i).Int())
case model.Duration:
metricValue = float64(rv.Field(i).Interface().(model.Duration))
case flagext.ByteSize:
metricValue = float64(rv.Field(i).Uint())
case float64:
metricValue = rv.Field(i).Float()
default:

v, ok := extract(rv, i)

// Only report fields which are explicitly overridden
// The !ok check runs first, so the == comparison is only evaluated for
// fields of the scalar types that extract accepts.
if !ok || rv.Field(i).Interface() == defs.Field(i).Interface() {
continue

}
metricLabelValue = rv.Type().Field(i).Tag.Get("yaml")

ch <- prometheus.MustNewConstMetric(oe.description, prometheus.GaugeValue, metricValue, metricLabelValue, tenant)
// Label values follow the descriptor's label order: limit_name, then user.
metricLabelValue := rv.Type().Field(i).Tag.Get("yaml")
ch <- prometheus.MustNewConstMetric(oe.tenantDesc, prometheus.GaugeValue, v, metricLabelValue, tenant)
}
}

}
Loading