diff --git a/.changelog/23964.txt b/.changelog/23964.txt new file mode 100644 index 00000000000..c3d488f9ada --- /dev/null +++ b/.changelog/23964.txt @@ -0,0 +1,3 @@ +```release-note:improvement +metrics: introduce client config to include alloc metadata as part of the base labels +``` diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 21f2b3dd906..5d74ab03937 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -510,6 +510,20 @@ func (tr *TaskRunner) initLabels() { }, } + if tr.clientConfig.IncludeAllocMetadataInMetrics { + combined := alloc.Job.CombinedTaskMeta(alloc.TaskGroup, tr.taskName) + for meta, metaValue := range combined { + if len(tr.clientConfig.AllowedMetadataKeysInMetrics) > 0 && !slices.Contains(tr.clientConfig.AllowedMetadataKeysInMetrics, meta) { + continue + } + + tr.baseLabels = append(tr.baseLabels, metrics.Label{ + Name: strings.ReplaceAll(meta, "-", "_"), + Value: metaValue, + }) + } + } + if tr.alloc.Job.ParentID != "" { tr.baseLabels = append(tr.baseLabels, metrics.Label{ Name: "parent_id", diff --git a/client/allocrunner/taskrunner/task_runner_test.go b/client/allocrunner/taskrunner/task_runner_test.go index d7fdbe3dc0f..ee29756519b 100644 --- a/client/allocrunner/taskrunner/task_runner_test.go +++ b/client/allocrunner/taskrunner/task_runner_test.go @@ -2866,6 +2866,40 @@ func TestTaskRunner_BaseLabels(t *testing.T) { require.Equal(alloc.Namespace, labels["namespace"]) } +// TestTaskRunner_BaseLabels_IncludesAllocMetadata tests that the base labels include +// the allocation metadata fields using the provided allowed list of keys +func TestTaskRunner_BaseLabels_IncludesAllocMetadata(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + alloc.Namespace = "not-default" + job := alloc.Job + job.Meta = map[string]string{"owner": "HashiCorp", "my-key": "my-value", "some_dynamic_value": "now()"} + task := 
job.TaskGroups[0].Tasks[0] + task.Driver = "raw_exec" + task.Config = map[string]interface{}{ + "command": "whoami", + } + + trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name, nil) + defer cleanup() + + trConfig.ClientConfig.IncludeAllocMetadataInMetrics = true + trConfig.ClientConfig.AllowedMetadataKeysInMetrics = []string{"owner", "my-key"} + + tr, err := NewTaskRunner(trConfig) + must.NoError(t, err) + + labels := map[string]string{} + for _, e := range tr.baseLabels { + labels[e.Name] = e.Value + } + + must.Eq(t, "HashiCorp", labels["owner"]) + must.Eq(t, "my-value", labels["my_key"]) + must.MapNotContainsKey(t, labels, "some_dynamic_value") +} + // TestTaskRunner_IdentityHook_Enabled asserts that the identity hook exposes a // workload identity to a task. func TestTaskRunner_IdentityHook_Enabled(t *testing.T) { diff --git a/client/config/config.go b/client/config/config.go index 86e28c2cbbc..89bb0a0a956 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -203,6 +203,14 @@ type Config struct { // allocation metrics to remote Telemetry sinks PublishAllocationMetrics bool + // IncludeAllocMetadataInMetrics determines whether nomad should include the + // allocation metadata as labels in the metrics to remote Telemetry sinks + IncludeAllocMetadataInMetrics bool + + // AllowedMetadataKeysInMetrics when provided nomad will only include the + // configured metadata keys as part of the metrics to remote Telemetry sinks + AllowedMetadataKeysInMetrics []string + // TLSConfig holds various TLS related configurations TLSConfig *structsc.TLSConfig diff --git a/command/agent/agent.go b/command/agent/agent.go index f9dc2e69e8b..48aa503c904 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -861,6 +861,8 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval conf.PublishNodeMetrics = agentConfig.Telemetry.PublishNodeMetrics 
conf.PublishAllocationMetrics = agentConfig.Telemetry.PublishAllocationMetrics + conf.IncludeAllocMetadataInMetrics = agentConfig.Telemetry.IncludeAllocMetadataInMetrics + conf.AllowedMetadataKeysInMetrics = agentConfig.Telemetry.AllowedMetadataKeysInMetrics // Set the TLS related configs conf.TLSConfig = agentConfig.TLSConfig diff --git a/command/agent/config.go b/command/agent/config.go index ab79f196454..2c8aa514ce0 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -969,17 +969,19 @@ type Telemetry struct { InMemoryRetentionPeriod string `hcl:"in_memory_retention_period"` inMemoryRetentionPeriod time.Duration `hcl:"-"` - StatsiteAddr string `hcl:"statsite_address"` - StatsdAddr string `hcl:"statsd_address"` - DataDogAddr string `hcl:"datadog_address"` - DataDogTags []string `hcl:"datadog_tags"` - PrometheusMetrics bool `hcl:"prometheus_metrics"` - DisableHostname bool `hcl:"disable_hostname"` - UseNodeName bool `hcl:"use_node_name"` - CollectionInterval string `hcl:"collection_interval"` - collectionInterval time.Duration `hcl:"-"` - PublishAllocationMetrics bool `hcl:"publish_allocation_metrics"` - PublishNodeMetrics bool `hcl:"publish_node_metrics"` + StatsiteAddr string `hcl:"statsite_address"` + StatsdAddr string `hcl:"statsd_address"` + DataDogAddr string `hcl:"datadog_address"` + DataDogTags []string `hcl:"datadog_tags"` + PrometheusMetrics bool `hcl:"prometheus_metrics"` + DisableHostname bool `hcl:"disable_hostname"` + UseNodeName bool `hcl:"use_node_name"` + CollectionInterval string `hcl:"collection_interval"` + collectionInterval time.Duration `hcl:"-"` + PublishAllocationMetrics bool `hcl:"publish_allocation_metrics"` + PublishNodeMetrics bool `hcl:"publish_node_metrics"` + IncludeAllocMetadataInMetrics bool `hcl:"include_alloc_metadata_in_metrics"` + AllowedMetadataKeysInMetrics []string `hcl:"allowed_metadata_keys_in_metrics"` // PrefixFilter allows for filtering out metrics from being collected PrefixFilter []string 
`hcl:"prefix_filter"` @@ -1343,6 +1345,8 @@ func DevConfig(mode *devModeConfig) *Config { conf.Telemetry.PrometheusMetrics = true conf.Telemetry.PublishAllocationMetrics = true conf.Telemetry.PublishNodeMetrics = true + conf.Telemetry.IncludeAllocMetadataInMetrics = true + conf.Telemetry.AllowedMetadataKeysInMetrics = []string{} if mode.consulMode { conf.Consuls[0].ServiceIdentity = &config.WorkloadIdentityConfig{ @@ -2524,6 +2528,10 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry { if b.PublishAllocationMetrics { result.PublishAllocationMetrics = true } + if b.IncludeAllocMetadataInMetrics { + result.IncludeAllocMetadataInMetrics = true + } + result.AllowedMetadataKeysInMetrics = append(result.AllowedMetadataKeysInMetrics, b.AllowedMetadataKeysInMetrics...) if b.CirconusAPIToken != "" { result.CirconusAPIToken = b.CirconusAPIToken } diff --git a/website/content/docs/configuration/telemetry.mdx b/website/content/docs/configuration/telemetry.mdx index 071fdc3cafb..2899580b6ee 100644 --- a/website/content/docs/configuration/telemetry.mdx +++ b/website/content/docs/configuration/telemetry.mdx @@ -58,6 +58,14 @@ The following options are available on all telemetry configurations. - `publish_allocation_metrics` `(bool: false)` - Specifies if Nomad should publish runtime metrics of allocations. +- `include_alloc_metadata_in_metrics` `(bool: false)` - This controls whether + allocation metadata is included in metric labels. Enabling this option may result in + high-cardinality labels. You should also configure [allowed_metadata_keys_in_metrics](#allowed_metadata_keys_in_metrics). + +- `allowed_metadata_keys_in_metrics` `(list: [])` - This filters the metadata + keys to be included in the metric publishing. By default it does not filter + out any keys and thus includes all metadata. + - `publish_node_metrics` `(bool: false)` - Specifies if Nomad should publish runtime metrics of nodes.