From 4985d9edcb3c6b352f6f57b8b213650291c99ee4 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Mon, 25 Mar 2024 10:19:08 -0500 Subject: [PATCH] backport of commit facc3e8013fb5d2ef63474cda127dc6bb91155b4 (#20217) Co-authored-by: James Rasell --- .changelog/20166.txt | 3 + command/agent/command.go | 16 +++-- command/agent/config.go | 66 +++++++++++++++++-- command/agent/config_parse.go | 2 + command/agent/config_parse_test.go | 42 +++++++++--- command/agent/config_test.go | 58 ++++++++++++++++ command/agent/testdata/basic.hcl | 16 +++-- command/agent/testdata/basic.json | 14 ++-- .../content/docs/configuration/telemetry.mdx | 10 +++ 9 files changed, 193 insertions(+), 34 deletions(-) create mode 100644 .changelog/20166.txt diff --git a/.changelog/20166.txt b/.changelog/20166.txt new file mode 100644 index 00000000000..94923297b57 --- /dev/null +++ b/.changelog/20166.txt @@ -0,0 +1,3 @@ +```release-note:bug +agent: allow configuration of in-memory telemetry sink +``` diff --git a/command/agent/command.go b/command/agent/command.go index 8cc58f2c3ef..4b3f09dbe77 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -351,6 +351,11 @@ func (c *Command) IsValidConfig(config, cmdConfig *Config) bool { return false } + if err := config.Telemetry.Validate(); err != nil { + c.Ui.Error(fmt.Sprintf("telemetry block invalid: %v", err)) + return false + } + // Set up the TLS configuration properly if we have one. // XXX chelseakomlo: set up a TLSConfig New method which would wrap // constructor-type actions like this. @@ -1149,14 +1154,8 @@ func (c *Command) handleReload() { } } -// setupTelemetry is used ot setup the telemetry sub-systems +// setupTelemetry is used to set up the telemetry sub-systems. func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) { - /* Setup telemetry - Aggregate on 10 second intervals for 1 minute. Expose the - metrics over stderr when there is a SIGUSR1 received. - */ - inm := metrics.NewInmemSink(10*time.Second, time.Minute) - metrics.DefaultInmemSignal(inm) var telConfig *Telemetry if config.Telemetry == nil { @@ -1165,6 +1164,9 @@ func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) { telConfig = config.Telemetry } + inm := metrics.NewInmemSink(telConfig.inMemoryCollectionInterval, telConfig.inMemoryRetentionPeriod) + metrics.DefaultInmemSignal(inm) + metricsConf := metrics.DefaultConfig("nomad") metricsConf.EnableHostname = !telConfig.DisableHostname diff --git a/command/agent/config.go b/command/agent/config.go index 21eeb80ced3..29eef600afe 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -935,6 +935,20 @@ func (s *ServerConfig) EncryptBytes() ([]byte, error) { // Telemetry is the telemetry configuration for the server type Telemetry struct { + + // InMemoryCollectionInterval configures the in-memory sink collection + // interval. This sink is always configured and backs the JSON metrics API + // endpoint. This option is particularly useful for debugging or + // development. + InMemoryCollectionInterval string `hcl:"in_memory_collection_interval"` + inMemoryCollectionInterval time.Duration `hcl:"-"` + + // InMemoryRetentionPeriod configures the in-memory sink retention period + // This sink is always configured and backs the JSON metrics API endpoint. + // This option is particularly useful for debugging or development. + InMemoryRetentionPeriod string `hcl:"in_memory_retention_period"` + inMemoryRetentionPeriod time.Duration `hcl:"-"` + StatsiteAddr string `hcl:"statsite_address"` StatsdAddr string `hcl:"statsd_address"` DataDogAddr string `hcl:"datadog_address"` @@ -1055,8 +1069,8 @@ func (t *Telemetry) Copy() *Telemetry { } // PrefixFilters parses the PrefixFilter field and returns a list of allowed and blocked filters -func (a *Telemetry) PrefixFilters() (allowed, blocked []string, err error) { - for _, rule := range a.PrefixFilter { +func (t *Telemetry) PrefixFilters() (allowed, blocked []string, err error) { + for _, rule := range t.PrefixFilter { if rule == "" { continue } @@ -1072,6 +1086,30 @@ func (a *Telemetry) PrefixFilters() (allowed, blocked []string, err error) { return allowed, blocked, nil } +// Validate the telemetry configuration options. These are used by the agent, +// regardless of mode, so can live here rather than a structs package. It is +// safe to call, without checking whether the config object is nil first. +func (t *Telemetry) Validate() error { + if t == nil { + return nil + } + + // Ensure we have durations that are greater than zero. + if t.inMemoryCollectionInterval <= 0 { + return errors.New("telemetry in-memory collection interval must be greater than zero") + } + if t.inMemoryRetentionPeriod <= 0 { + return errors.New("telemetry in-memory retention period must be greater than zero") + } + + // Ensure the in-memory durations do not conflict. + if t.inMemoryCollectionInterval > t.inMemoryRetentionPeriod { + return errors.New("telemetry in-memory collection interval cannot be greater than retention period") + } + + return nil +} + // Ports encapsulates the various ports we bind to for network services. If any // are not specified then the defaults are used instead. type Ports struct { @@ -1383,8 +1421,12 @@ func DefaultConfig() *Config { }, SyslogFacility: "LOCAL0", Telemetry: &Telemetry{ - CollectionInterval: "1s", - collectionInterval: 1 * time.Second, + InMemoryCollectionInterval: "10s", + inMemoryCollectionInterval: 10 * time.Second, + InMemoryRetentionPeriod: "1m", + inMemoryRetentionPeriod: 1 * time.Minute, + CollectionInterval: "1s", + collectionInterval: 1 * time.Second, }, TLSConfig: &config.TLSConfig{}, Sentinel: &config.SentinelConfig{}, @@ -2359,9 +2401,21 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig { } // Merge is used to merge two telemetry configs together -func (a *Telemetry) Merge(b *Telemetry) *Telemetry { - result := *a +func (t *Telemetry) Merge(b *Telemetry) *Telemetry { + result := *t + if b.InMemoryCollectionInterval != "" { + result.InMemoryCollectionInterval = b.InMemoryCollectionInterval + } + if b.inMemoryCollectionInterval != 0 { + result.inMemoryCollectionInterval = b.inMemoryCollectionInterval + } + if b.InMemoryRetentionPeriod != "" { + result.InMemoryRetentionPeriod = b.InMemoryRetentionPeriod + } + if b.inMemoryRetentionPeriod != 0 { + result.inMemoryRetentionPeriod = b.inMemoryRetentionPeriod + } if b.StatsiteAddr != "" { result.StatsiteAddr = b.StatsiteAddr } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 859db5db53e..0ca36170446 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -109,6 +109,8 @@ func ParseConfigFile(path string) (*Config, error) { {"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL, nil}, {"autopilot.server_stabilization_time", &c.Autopilot.ServerStabilizationTime, &c.Autopilot.ServerStabilizationTimeHCL, nil}, {"autopilot.last_contact_threshold", &c.Autopilot.LastContactThreshold, &c.Autopilot.LastContactThresholdHCL, nil}, + {"telemetry.in_memory_collection_interval", &c.Telemetry.inMemoryCollectionInterval, &c.Telemetry.InMemoryCollectionInterval, nil}, + {"telemetry.in_memory_retention_period", &c.Telemetry.inMemoryRetentionPeriod, &c.Telemetry.InMemoryRetentionPeriod, nil}, {"telemetry.collection_interval", &c.Telemetry.collectionInterval, &c.Telemetry.CollectionInterval, nil}, {"client.template.block_query_wait", nil, &c.Client.TemplateConfig.BlockQueryWaitTimeHCL, func(d *time.Duration) { diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 6231125b90c..3854eed378d 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -196,15 +196,19 @@ var basicConfig = &Config{ }, }, Telemetry: &Telemetry{ - StatsiteAddr: "127.0.0.1:1234", - StatsdAddr: "127.0.0.1:2345", - PrometheusMetrics: true, - DisableHostname: true, - UseNodeName: false, - CollectionInterval: "3s", - collectionInterval: 3 * time.Second, - PublishAllocationMetrics: true, - PublishNodeMetrics: true, + StatsiteAddr: "127.0.0.1:1234", + StatsdAddr: "127.0.0.1:2345", + PrometheusMetrics: true, + DisableHostname: true, + UseNodeName: false, + InMemoryCollectionInterval: "1m", + inMemoryCollectionInterval: 1 * time.Minute, + InMemoryRetentionPeriod: "24h", + inMemoryRetentionPeriod: 24 * time.Hour, + CollectionInterval: "3s", + collectionInterval: 3 * time.Second, + PublishAllocationMetrics: true, + PublishNodeMetrics: true, }, LeaveOnInt: true, LeaveOnTerm: true, @@ -1079,3 +1083,23 @@ func TestConfig_MultipleConsul(t *testing.T) { }) } } + +func TestConfig_Telemetry(t *testing.T) { + ci.Parallel(t) + + // Ensure merging a mostly empty struct correctly inherits default values + // set. + inputTelemetry1 := &Telemetry{PrometheusMetrics: true} + mergedTelemetry1 := DefaultConfig().Telemetry.Merge(inputTelemetry1) + must.Eq(t, mergedTelemetry1.inMemoryCollectionInterval, 10*time.Second) + must.Eq(t, mergedTelemetry1.inMemoryRetentionPeriod, 1*time.Minute) + + // Ensure we can then overlay user specified data. + inputTelemetry2 := &Telemetry{ + inMemoryCollectionInterval: 1 * time.Second, + inMemoryRetentionPeriod: 10 * time.Second, + } + mergedTelemetry2 := mergedTelemetry1.Merge(inputTelemetry2) + must.Eq(t, mergedTelemetry2.inMemoryCollectionInterval, 1*time.Second) + must.Eq(t, mergedTelemetry2.inMemoryRetentionPeriod, 10*time.Second) +} diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 6c8c1d4639f..692acac38dd 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -4,6 +4,7 @@ package agent import ( + "errors" "fmt" "net" "os" @@ -1385,6 +1386,63 @@ func TestTelemetry_PrefixFilters(t *testing.T) { } } +func TestTelemetry_Validate(t *testing.T) { + ci.Parallel(t) + + testCases := []struct { + name string + inputTelemetry *Telemetry + expectedError error + }{ + { + name: "nil", + inputTelemetry: nil, + expectedError: nil, + }, + { + name: "invalid", + inputTelemetry: &Telemetry{ + inMemoryCollectionInterval: 10 * time.Second, + inMemoryRetentionPeriod: 1 * time.Second, + }, + expectedError: errors.New("telemetry in-memory collection interval cannot be greater than retention period"), + }, + { + name: "valid", + inputTelemetry: &Telemetry{ + inMemoryCollectionInterval: 1 * time.Second, + inMemoryRetentionPeriod: 10 * time.Second, + }, + expectedError: nil, + }, + { + name: "missing in-memory interval", + inputTelemetry: &Telemetry{ + inMemoryRetentionPeriod: 10 * time.Second, + }, + expectedError: errors.New("telemetry in-memory collection interval must be greater than zero"), + }, + { + name: "missing in-memory collection", + inputTelemetry: &Telemetry{ + inMemoryCollectionInterval: 10 * time.Second, + }, + expectedError: errors.New("telemetry in-memory retention period must be greater than zero"), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actualError := tc.inputTelemetry.Validate() + if tc.expectedError != nil { + must.EqError(t, actualError, tc.expectedError.Error()) + } else { + must.NoError(t, actualError) + } + }) + } +} + func TestTelemetry_Parse(t *testing.T) { ci.Parallel(t) diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl index d3a27c64b2c..f9d88d0a379 100644 --- a/command/agent/testdata/basic.hcl +++ b/command/agent/testdata/basic.hcl @@ -199,13 +199,15 @@ audit { } telemetry { - statsite_address = "127.0.0.1:1234" - statsd_address = "127.0.0.1:2345" - prometheus_metrics = true - disable_hostname = true - collection_interval = "3s" - publish_allocation_metrics = true - publish_node_metrics = true + in_memory_collection_interval = "1m" + in_memory_retention_period = "24h" + statsite_address = "127.0.0.1:1234" + statsd_address = "127.0.0.1:2345" + prometheus_metrics = true + disable_hostname = true + collection_interval = "3s" + publish_allocation_metrics = true + publish_node_metrics = true } leave_on_interrupt = true diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json index 14f07eea029..4804e6a9c79 100644 --- a/command/agent/testdata/basic.json +++ b/command/agent/testdata/basic.json @@ -169,7 +169,6 @@ "server_service_name": "nomad", "service_auth_method": "nomad-services", "task_auth_method": "nomad-tasks", - "service_identity": { "aud": [ "consul.io", @@ -360,6 +359,8 @@ "syslog_facility": "LOCAL1", "telemetry": [ { + "in_memory_collection_interval": "1m", + "in_memory_retention_period": "24h", "collection_interval": "3s", "disable_hostname": true, "prometheus_metrics": true, @@ -393,7 +394,10 @@ "cert_file": "/path/to/cert/file", "create_from_role": "test_role", "default_identity": { - "aud": ["vault.io", "nomad.io"], + "aud": [ + "vault.io", + "nomad.io" + ], "env": false, "file": true, "ttl": "3h" @@ -407,9 +411,9 @@ "token": "12345" } ], - "reporting":{ - "license":{ - "enabled":"true" + "reporting": { + "license": { + "enabled": "true" } } } diff --git a/website/content/docs/configuration/telemetry.mdx b/website/content/docs/configuration/telemetry.mdx index b99918d4f30..d76a07b9f4d 100644 --- a/website/content/docs/configuration/telemetry.mdx +++ b/website/content/docs/configuration/telemetry.mdx @@ -33,6 +33,16 @@ parameters on this page are grouped by the telemetry provider. The following options are available on all telemetry configurations. +- `in_memory_collection_interval` `(duration: 10s)` Configures the in-memory + sink collection interval. This sink is always configured and backs the JSON + metrics API endpoint. This option is particularly useful for debugging or + development purposes, where aggressive collection is required. + +- `in_memory_retention_period` `(duration: 1m)` Configures the in-memory sink + retention period. This sink is always configured and backs the JSON metrics + API endpoint. This option is particularly useful for debugging or development + purposes. + - `disable_hostname` `(bool: false)` - Specifies if gauge values should be prefixed with the local hostname.