Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: introduce client config to include meta as part of the base label #23964

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/23964.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
metrics: introduce client config to include alloc metadata as part of the base labels
```
14 changes: 14 additions & 0 deletions client/allocrunner/taskrunner/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,20 @@ func (tr *TaskRunner) initLabels() {
},
}

if tr.clientConfig.IncludeAllocMetadataInMetrics {
combined := alloc.Job.CombinedTaskMeta(alloc.TaskGroup, tr.taskName)
for meta, metaValue := range combined {
if len(tr.clientConfig.AllowedMetadataKeysInMetrics) > 0 && !slices.Contains(tr.clientConfig.AllowedMetadataKeysInMetrics, meta) {
continue
}

tr.baseLabels = append(tr.baseLabels, metrics.Label{
Name: strings.ReplaceAll(meta, "-", "_"),
Value: metaValue,
})
}
}

if tr.alloc.Job.ParentID != "" {
tr.baseLabels = append(tr.baseLabels, metrics.Label{
Name: "parent_id",
Expand Down
34 changes: 34 additions & 0 deletions client/allocrunner/taskrunner/task_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2866,6 +2866,40 @@ func TestTaskRunner_BaseLabels(t *testing.T) {
require.Equal(alloc.Namespace, labels["namespace"])
}

// TestTaskRunner_BaseLabels_IncludesAllocMetadata asserts that, with alloc
// metadata enabled for metrics and an allow list configured, the task runner's
// base labels carry only the allowed job meta keys. The "my-key" entry is
// expected to surface under the label name "my_key".
func TestTaskRunner_BaseLabels_IncludesAllocMetadata(t *testing.T) {
	ci.Parallel(t)

	// Batch alloc whose job carries three meta entries; only two are allowed below.
	alloc := mock.BatchAlloc()
	alloc.Namespace = "not-default"
	alloc.Job.Meta = map[string]string{
		"owner":              "HashiCorp",
		"my-key":             "my-value",
		"some_dynamic_value": "now()",
	}

	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "raw_exec"
	task.Config = map[string]interface{}{"command": "whoami"}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name, nil)
	defer cleanup()

	// Turn the feature on and restrict it to a subset of the job meta keys.
	conf.ClientConfig.IncludeAllocMetadataInMetrics = true
	conf.ClientConfig.AllowedMetadataKeysInMetrics = []string{"owner", "my-key"}

	runner, err := NewTaskRunner(conf)
	must.NoError(t, err)

	// Index the base labels by name so individual entries can be checked.
	got := make(map[string]string, len(runner.baseLabels))
	for _, label := range runner.baseLabels {
		got[label.Name] = label.Value
	}

	must.Eq(t, "HashiCorp", got["owner"])
	must.Eq(t, "my-value", got["my_key"])
	// The key that is not on the allow list must not be published.
	must.MapNotContainsKey(t, got, "some_dynamic_value")
}

// TestTaskRunner_IdentityHook_Enabled asserts that the identity hook exposes a
// workload identity to a task.
func TestTaskRunner_IdentityHook_Enabled(t *testing.T) {
Expand Down
8 changes: 8 additions & 0 deletions client/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,14 @@ type Config struct {
// allocation metrics to remote Telemetry sinks
PublishAllocationMetrics bool

// IncludeAllocMetadataInMetrics determines whether nomad should include the
// allocation metadata as labels in the metrics to remote Telemetry sinks
IncludeAllocMetadataInMetrics bool

// AllowedMetadataKeysInMetrics, when provided, limits the allocation metadata
// included in the metrics sent to remote Telemetry sinks to the configured keys
AllowedMetadataKeysInMetrics []string

// TLSConfig holds various TLS related configurations
TLSConfig *structsc.TLSConfig

Expand Down
2 changes: 2 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,8 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) {
conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
conf.PublishNodeMetrics = agentConfig.Telemetry.PublishNodeMetrics
conf.PublishAllocationMetrics = agentConfig.Telemetry.PublishAllocationMetrics
conf.IncludeAllocMetadataInMetrics = agentConfig.Telemetry.IncludeAllocMetadataInMetrics
conf.AllowedMetadataKeysInMetrics = agentConfig.Telemetry.AllowedMetadataKeysInMetrics

// Set the TLS related configs
conf.TLSConfig = agentConfig.TLSConfig
Expand Down
30 changes: 19 additions & 11 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -969,17 +969,19 @@ type Telemetry struct {
InMemoryRetentionPeriod string `hcl:"in_memory_retention_period"`
inMemoryRetentionPeriod time.Duration `hcl:"-"`

StatsiteAddr string `hcl:"statsite_address"`
StatsdAddr string `hcl:"statsd_address"`
DataDogAddr string `hcl:"datadog_address"`
DataDogTags []string `hcl:"datadog_tags"`
PrometheusMetrics bool `hcl:"prometheus_metrics"`
DisableHostname bool `hcl:"disable_hostname"`
UseNodeName bool `hcl:"use_node_name"`
CollectionInterval string `hcl:"collection_interval"`
collectionInterval time.Duration `hcl:"-"`
PublishAllocationMetrics bool `hcl:"publish_allocation_metrics"`
PublishNodeMetrics bool `hcl:"publish_node_metrics"`
StatsiteAddr string `hcl:"statsite_address"`
StatsdAddr string `hcl:"statsd_address"`
DataDogAddr string `hcl:"datadog_address"`
DataDogTags []string `hcl:"datadog_tags"`
PrometheusMetrics bool `hcl:"prometheus_metrics"`
DisableHostname bool `hcl:"disable_hostname"`
UseNodeName bool `hcl:"use_node_name"`
CollectionInterval string `hcl:"collection_interval"`
collectionInterval time.Duration `hcl:"-"`
PublishAllocationMetrics bool `hcl:"publish_allocation_metrics"`
PublishNodeMetrics bool `hcl:"publish_node_metrics"`
IncludeAllocMetadataInMetrics bool `hcl:"include_alloc_metadata_in_metrics"`
AllowedMetadataKeysInMetrics []string `hcl:"allowed_metadata_keys_in_metrics"`

// PrefixFilter allows for filtering out metrics from being collected
PrefixFilter []string `hcl:"prefix_filter"`
Expand Down Expand Up @@ -1343,6 +1345,8 @@ func DevConfig(mode *devModeConfig) *Config {
conf.Telemetry.PrometheusMetrics = true
conf.Telemetry.PublishAllocationMetrics = true
conf.Telemetry.PublishNodeMetrics = true
conf.Telemetry.IncludeAllocMetadataInMetrics = true
conf.Telemetry.AllowedMetadataKeysInMetrics = []string{}

if mode.consulMode {
conf.Consuls[0].ServiceIdentity = &config.WorkloadIdentityConfig{
Expand Down Expand Up @@ -2524,6 +2528,10 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry {
if b.PublishAllocationMetrics {
result.PublishAllocationMetrics = true
}
if b.IncludeAllocMetadataInMetrics {
result.IncludeAllocMetadataInMetrics = true
}
result.AllowedMetadataKeysInMetrics = append(result.AllowedMetadataKeysInMetrics, b.AllowedMetadataKeysInMetrics...)
if b.CirconusAPIToken != "" {
result.CirconusAPIToken = b.CirconusAPIToken
}
Expand Down
8 changes: 8 additions & 0 deletions website/content/docs/configuration/telemetry.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ The following options are available on all telemetry configurations.
- `publish_allocation_metrics` `(bool: false)` - Specifies if Nomad should
publish runtime metrics of allocations.

- `include_alloc_metadata_in_metrics` `(bool: false)` - This controls whether
allocation metadata is included in metric labels. Enabling this option may result in
high cardinality labels. You should also configure [allowed_metadata_keys_in_metrics](#allowed_metadata_keys_in_metrics).

- `allowed_metadata_keys_in_metrics` `(list: [])` - This filters the metadata
keys to be included in the metric publishing. By default it does not filter
out any keys and thus includes all metadata.

- `publish_node_metrics` `(bool: false)` - Specifies if Nomad should publish
runtime metrics of nodes.

Expand Down