From 7c7f9c9c323a3b0dd4ddbab8729510936aac7c9d Mon Sep 17 00:00:00 2001 From: James Rasell Date: Wed, 20 Mar 2024 08:49:11 +0000 Subject: [PATCH 1/4] agent: allow configuration of in-memory telemetry sink. This change adds configuration options for setting the in-memory telemetry sink collection and retention durations. This sink backs the metrics JSON API and previously had hard-coded default values. The new options are particularly useful when running development or debug environments, where metrics collection is desired at a fast and granular rate. --- command/agent/command.go | 29 +++++++++++---- command/agent/config.go | 60 +++++++++++++++++++++++++++--- command/agent/config_parse.go | 2 + command/agent/config_parse_test.go | 22 ++++++----- command/agent/config_test.go | 44 ++++++++++++++++++++++ command/agent/testdata/basic.hcl | 16 ++++---- command/agent/testdata/basic.json | 14 ++++--- 7 files changed, 153 insertions(+), 34 deletions(-) diff --git a/command/agent/command.go b/command/agent/command.go index 8cc58f2c3ef..f977c9ad451 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -351,6 +351,11 @@ func (c *Command) IsValidConfig(config, cmdConfig *Config) bool { return false } + if err := config.Telemetry.Validate(); err != nil { + c.Ui.Error(fmt.Sprintf("telemetry block invalid: %v", err)) + return false + } + // Set up the TLS configuration properly if we have one. // XXX chelseakomlo: set up a TLSConfig New method which would wrap // constructor-type actions like this. @@ -1149,14 +1154,8 @@ func (c *Command) handleReload() { } } -// setupTelemetry is used ot setup the telemetry sub-systems +// setupTelemetry is used to set up the telemetry sub-systems. func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) { - /* Setup telemetry - Aggregate on 10 second intervals for 1 minute. Expose the - metrics over stderr when there is a SIGUSR1 received. - */ - inm := metrics.NewInmemSink(10*time.Second, time.Minute) - metrics.DefaultInmemSignal(inm) var telConfig *Telemetry if config.Telemetry == nil { @@ -1165,6 +1164,22 @@ func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) { telConfig = config.Telemetry } + // The values are defaulted in both the dev and default config object, so + // we should never have empty values. Do this, so it can never happen and + // cause agents to panic on start. + inMemInt := 10 * time.Second + inMemRet := 1 * time.Minute + + if telConfig.inMemoryCollectionInterval > 0 { + inMemInt = telConfig.inMemoryCollectionInterval + } + if telConfig.inMemoryRetentionPeriod > 0 { + inMemRet = telConfig.inMemoryRetentionPeriod + } + + inm := metrics.NewInmemSink(inMemInt, inMemRet) + metrics.DefaultInmemSignal(inm) + metricsConf := metrics.DefaultConfig("nomad") metricsConf.EnableHostname = !telConfig.DisableHostname diff --git a/command/agent/config.go b/command/agent/config.go index 34e58b4ceeb..00c527bd589 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -935,6 +935,20 @@ func (s *ServerConfig) EncryptBytes() ([]byte, error) { // Telemetry is the telemetry configuration for the server type Telemetry struct { + + // InMemoryCollectionInterval configures the in-memory sink collection + // interval. This sink is always configured and backs the JSON metrics API + // endpoint. This option is particularly useful for debugging or + // development. + InMemoryCollectionInterval string `hcl:"in_memory_collection_interval"` + inMemoryCollectionInterval time.Duration `hcl:"-"` + + // InMemoryRetentionPeriod configures the in-memory sink retention period + // This sink is always configured and backs the JSON metrics API endpoint. + // This option is particularly useful for debugging or development. + InMemoryRetentionPeriod string `hcl:"in_memory_retention_period"` + inMemoryRetentionPeriod time.Duration `hcl:"-"` + StatsiteAddr string `hcl:"statsite_address"` StatsdAddr string `hcl:"statsd_address"` DataDogAddr string `hcl:"datadog_address"` @@ -1055,8 +1069,8 @@ func (t *Telemetry) Copy() *Telemetry { } // PrefixFilters parses the PrefixFilter field and returns a list of allowed and blocked filters -func (a *Telemetry) PrefixFilters() (allowed, blocked []string, err error) { - for _, rule := range a.PrefixFilter { +func (t *Telemetry) PrefixFilters() (allowed, blocked []string, err error) { + for _, rule := range t.PrefixFilter { if rule == "" { continue } @@ -1072,6 +1086,24 @@ func (a *Telemetry) PrefixFilters() (allowed, blocked []string, err error) { return allowed, blocked, nil } +// Validate the telemetry configuration options. These are used by the agent, +// regardless of mode, so can live here rather than a structs package. It is +// safe to call, without checking whether the config object is nil first. +func (t *Telemetry) Validate() error { + if t == nil { + return nil + } + + // Ensure the in-memory durations are set correctly. + if t.inMemoryRetentionPeriod > 0 && t.inMemoryCollectionInterval > 0 { + if t.inMemoryCollectionInterval > t.inMemoryRetentionPeriod { + return errors.New("telemetry in-memory collection interval cannot be greater than retention period") + } + } + + return nil +} + // Ports encapsulates the various ports we bind to for network services. If any // are not specified then the defaults are used instead. type Ports struct { @@ -1390,8 +1422,12 @@ func DefaultConfig() *Config { }, SyslogFacility: "LOCAL0", Telemetry: &Telemetry{ - CollectionInterval: "1s", - collectionInterval: 1 * time.Second, + InMemoryCollectionInterval: "10s", + inMemoryCollectionInterval: 10 * time.Second, + InMemoryRetentionPeriod: "1m", + inMemoryRetentionPeriod: 1 * time.Minute, + CollectionInterval: "1s", + collectionInterval: 1 * time.Second, }, TLSConfig: &config.TLSConfig{}, Sentinel: &config.SentinelConfig{}, @@ -2366,9 +2402,21 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig { } // Merge is used to merge two telemetry configs together -func (a *Telemetry) Merge(b *Telemetry) *Telemetry { - result := *a +func (t *Telemetry) Merge(b *Telemetry) *Telemetry { + result := *t + if b.InMemoryCollectionInterval != "" { + result.InMemoryCollectionInterval = b.InMemoryCollectionInterval + } + if b.inMemoryCollectionInterval != 0 { + result.inMemoryCollectionInterval = b.inMemoryCollectionInterval + } + if b.InMemoryRetentionPeriod != "" { + result.InMemoryRetentionPeriod = b.InMemoryRetentionPeriod + } + if b.inMemoryRetentionPeriod != 0 { + result.inMemoryRetentionPeriod = b.inMemoryRetentionPeriod + } if b.StatsiteAddr != "" { result.StatsiteAddr = b.StatsiteAddr } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 859db5db53e..0ca36170446 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -109,6 +109,8 @@ func ParseConfigFile(path string) (*Config, error) { {"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL, nil}, {"autopilot.server_stabilization_time", &c.Autopilot.ServerStabilizationTime, &c.Autopilot.ServerStabilizationTimeHCL, nil}, {"autopilot.last_contact_threshold", &c.Autopilot.LastContactThreshold, &c.Autopilot.LastContactThresholdHCL, nil}, + {"telemetry.in_memory_collection_interval", &c.Telemetry.inMemoryCollectionInterval, &c.Telemetry.InMemoryCollectionInterval, nil}, + {"telemetry.in_memory_retention_period", &c.Telemetry.inMemoryRetentionPeriod, &c.Telemetry.InMemoryRetentionPeriod, nil}, {"telemetry.collection_interval", &c.Telemetry.collectionInterval, &c.Telemetry.CollectionInterval, nil}, {"client.template.block_query_wait", nil, &c.Client.TemplateConfig.BlockQueryWaitTimeHCL, func(d *time.Duration) { diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 6231125b90c..99da9adaf42 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -196,15 +196,19 @@ var basicConfig = &Config{ }, }, Telemetry: &Telemetry{ - StatsiteAddr: "127.0.0.1:1234", - StatsdAddr: "127.0.0.1:2345", - PrometheusMetrics: true, - DisableHostname: true, - UseNodeName: false, - CollectionInterval: "3s", - collectionInterval: 3 * time.Second, - PublishAllocationMetrics: true, - PublishNodeMetrics: true, + StatsiteAddr: "127.0.0.1:1234", + StatsdAddr: "127.0.0.1:2345", + PrometheusMetrics: true, + DisableHostname: true, + UseNodeName: false, + InMemoryCollectionInterval: "1m", + inMemoryCollectionInterval: 1 * time.Minute, + InMemoryRetentionPeriod: "24h", + inMemoryRetentionPeriod: 24 * time.Hour, + CollectionInterval: "3s", + collectionInterval: 3 * time.Second, + PublishAllocationMetrics: true, + PublishNodeMetrics: true, }, LeaveOnInt: true, LeaveOnTerm: true, diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 43c3189aabd..f98cbedcf28 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -4,6 +4,7 @@ package agent import ( + "errors" "fmt" "net" "os" @@ -1376,6 +1377,49 @@ func TestTelemetry_PrefixFilters(t *testing.T) { } } +func TestTelemetry_Validate(t *testing.T) { + ci.Parallel(t) + + testCases := []struct { + name string + inputTelemetry *Telemetry + expectedError error + }{ + { + name: "nil", + inputTelemetry: nil, + expectedError: nil, + }, + { + name: "invalid", + inputTelemetry: &Telemetry{ + inMemoryCollectionInterval: 10 * time.Second, + inMemoryRetentionPeriod: 1 * time.Second, + }, + expectedError: errors.New("telemetry in-memory collection interval cannot be greater than retention period"), + }, + { + name: "valid", + inputTelemetry: &Telemetry{ + inMemoryCollectionInterval: 1 * time.Second, + inMemoryRetentionPeriod: 10 * time.Second, + }, + expectedError: nil, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actualError := tc.inputTelemetry.Validate() + if tc.expectedError != nil { + must.EqError(t, actualError, tc.expectedError.Error()) + } else { + must.NoError(t, actualError) + } + }) + } +} + func TestTelemetry_Parse(t *testing.T) { ci.Parallel(t) diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl index d3a27c64b2c..f9d88d0a379 100644 --- a/command/agent/testdata/basic.hcl +++ b/command/agent/testdata/basic.hcl @@ -199,13 +199,15 @@ audit { } telemetry { - statsite_address = "127.0.0.1:1234" - statsd_address = "127.0.0.1:2345" - prometheus_metrics = true - disable_hostname = true - collection_interval = "3s" - publish_allocation_metrics = true - publish_node_metrics = true + in_memory_collection_interval = "1m" + in_memory_retention_period = "24h" + statsite_address = "127.0.0.1:1234" + statsd_address = "127.0.0.1:2345" + prometheus_metrics = true + disable_hostname = true + collection_interval = "3s" + publish_allocation_metrics = true + publish_node_metrics = true } leave_on_interrupt = true diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json index 14f07eea029..4804e6a9c79 100644 --- a/command/agent/testdata/basic.json +++ b/command/agent/testdata/basic.json @@ -169,7 +169,6 @@ "server_service_name": "nomad", "service_auth_method": "nomad-services", "task_auth_method": "nomad-tasks", - "service_identity": { "aud": [ "consul.io", @@ -360,6 +359,8 @@ "syslog_facility": "LOCAL1", "telemetry": [ { + "in_memory_collection_interval": "1m", + "in_memory_retention_period": "24h", "collection_interval": "3s", "disable_hostname": true, "prometheus_metrics": true, @@ -393,7 +394,10 @@ "cert_file": "/path/to/cert/file", "create_from_role": "test_role", "default_identity": { - "aud": ["vault.io", "nomad.io"], + "aud": [ + "vault.io", + "nomad.io" + ], "env": false, "file": true, "ttl": "3h" @@ -407,9 +411,9 @@ "token": "12345" } ], - "reporting":{ - "license":{ - "enabled":"true" + "reporting": { + "license": { + "enabled": "true" } } } From a94d996cd8542922b1a9d0636e41d52f5da078d4 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Thu, 21 Mar 2024 09:02:29 +0000 Subject: [PATCH 2/4] agent: add additional config test for telemetry --- command/agent/config_parse_test.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 99da9adaf42..3854eed378d 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -1083,3 +1083,23 @@ func TestConfig_MultipleConsul(t *testing.T) { }) } } + +func TestConfig_Telemetry(t *testing.T) { + ci.Parallel(t) + + // Ensure merging a mostly empty struct correctly inherits default values + // set. + inputTelemetry1 := &Telemetry{PrometheusMetrics: true} + mergedTelemetry1 := DefaultConfig().Telemetry.Merge(inputTelemetry1) + must.Eq(t, mergedTelemetry1.inMemoryCollectionInterval, 10*time.Second) + must.Eq(t, mergedTelemetry1.inMemoryRetentionPeriod, 1*time.Minute) + + // Ensure we can then overlay user specified data. + inputTelemetry2 := &Telemetry{ + inMemoryCollectionInterval: 1 * time.Second, + inMemoryRetentionPeriod: 10 * time.Second, + } + mergedTelemetry2 := mergedTelemetry1.Merge(inputTelemetry2) + must.Eq(t, mergedTelemetry2.inMemoryCollectionInterval, 1*time.Second) + must.Eq(t, mergedTelemetry2.inMemoryRetentionPeriod, 10*time.Second) +} From d64d04fb2206f7ec0692e05260d6480ba6e188fa Mon Sep 17 00:00:00 2001 From: James Rasell Date: Thu, 21 Mar 2024 09:14:05 +0000 Subject: [PATCH 3/4] docs: add website config docs and changelog entry --- .changelog/20166.txt | 3 +++ website/content/docs/configuration/telemetry.mdx | 10 ++++++++++ 2 files changed, 13 insertions(+) create mode 100644 .changelog/20166.txt diff --git a/.changelog/20166.txt b/.changelog/20166.txt new file mode 100644 index 00000000000..94923297b57 --- /dev/null +++ b/.changelog/20166.txt @@ -0,0 +1,3 @@ +```release-note:bug +agent: allow configuration of in-memory telemetry sink +``` diff --git a/website/content/docs/configuration/telemetry.mdx b/website/content/docs/configuration/telemetry.mdx index bd6170f6bff..c6e299bdd99 100644 --- a/website/content/docs/configuration/telemetry.mdx +++ b/website/content/docs/configuration/telemetry.mdx @@ -33,6 +33,16 @@ parameters on this page are grouped by the telemetry provider. The following options are available on all telemetry configurations. +- `in_memory_collection_interval` `(duration: 10s)` Configures the in-memory + sink collection interval. This sink is always configured and backs the JSON + metrics API endpoint. This option is particularly useful for debugging or + development purposes, where aggressive collection is required. + +- `in_memory_retention_period` `(duration: 1m)` Configures the in-memory sink + retention period. This sink is always configured and backs the JSON metrics + API endpoint. This option is particularly useful for debugging or development + purposes. + - `disable_hostname` `(bool: false)` - Specifies if gauge values should be prefixed with the local hostname. From 308f0a1fa64b1331d8b3d5de0e1622196b97abc1 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Mon, 25 Mar 2024 09:06:42 +0000 Subject: [PATCH 4/4] updates based on feedback. --- command/agent/command.go | 15 +-------------- command/agent/config.go | 16 +++++++++++----- command/agent/config_test.go | 14 ++++++++++++++ 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/command/agent/command.go b/command/agent/command.go index 0edae2b0a6f..7acc63f089f 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -1170,20 +1170,7 @@ func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) { telConfig = config.Telemetry } - // The values are defaulted in both the dev and default config object, so - // we should never have empty values. Do this, so it can never happen and - // cause agents to panic on start. - inMemInt := 10 * time.Second - inMemRet := 1 * time.Minute - - if telConfig.inMemoryCollectionInterval > 0 { - inMemInt = telConfig.inMemoryCollectionInterval - } - if telConfig.inMemoryRetentionPeriod > 0 { - inMemRet = telConfig.inMemoryRetentionPeriod - } - - inm := metrics.NewInmemSink(inMemInt, inMemRet) + inm := metrics.NewInmemSink(telConfig.inMemoryCollectionInterval, telConfig.inMemoryRetentionPeriod) metrics.DefaultInmemSignal(inm) metricsConf := metrics.DefaultConfig("nomad") diff --git a/command/agent/config.go b/command/agent/config.go index 0339d17e43b..964764ee394 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -1101,11 +1101,17 @@ func (t *Telemetry) Validate() error { return nil } - // Ensure the in-memory durations are set correctly. - if t.inMemoryRetentionPeriod > 0 && t.inMemoryCollectionInterval > 0 { - if t.inMemoryCollectionInterval > t.inMemoryRetentionPeriod { - return errors.New("telemetry in-memory collection interval cannot be greater than retention period") - } + // Ensure we have durations that are greater than zero. + if t.inMemoryCollectionInterval <= 0 { + return errors.New("telemetry in-memory collection interval must be greater than zero") + } + if t.inMemoryRetentionPeriod <= 0 { + return errors.New("telemetry in-memory retention period must be greater than zero") + } + + // Ensure the in-memory durations do not conflict. + if t.inMemoryCollectionInterval > t.inMemoryRetentionPeriod { + return errors.New("telemetry in-memory collection interval cannot be greater than retention period") } return nil diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 9b89c88b95d..692acac38dd 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -1415,6 +1415,20 @@ func TestTelemetry_Validate(t *testing.T) { }, expectedError: nil, }, + { + name: "missing in-memory interval", + inputTelemetry: &Telemetry{ + inMemoryRetentionPeriod: 10 * time.Second, + }, + expectedError: errors.New("telemetry in-memory collection interval must be greater than zero"), + }, + { + name: "missing in-memory collection", + inputTelemetry: &Telemetry{ + inMemoryCollectionInterval: 10 * time.Second, + }, + expectedError: errors.New("telemetry in-memory retention period must be greater than zero"), + }, } for _, tc := range testCases {