diff --git a/CHANGELOG.md b/CHANGELOG.md index 32eb52d7800..9527ce8f46b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ## 0.4.1 (UNRELEASED) +__BACKWARDS INCOMPATIBILITIES:__ + * telemetry: Operators will have to explicitly opt-in for Nomad client to + publish allocation and node metrics + IMPROVEMENTS: * core: Allow count 0 on system jobs [GH-1421] * core: Gracefully handle short lived outages by holding RPC calls [GH-1403] @@ -11,6 +15,7 @@ IMPROVEMENTS: * client: Add killing event to task state [GH-1457] * client: Fingerprint network speed on Windows [GH-1443] * telemetry: Circonus integration for telemetry metrics [GH-1459] + * telemetry: Allow operators to opt-in for publishing metrics [GH-1501] BUG FIXES: * core: Sanitize empty slices/maps in jobs to avoid incorrect create/destroy diff --git a/client/client.go b/client/client.go index 7cfb4161a97..a8fe2be7806 100644 --- a/client/client.go +++ b/client/client.go @@ -1397,7 +1397,11 @@ func (c *Client) collectHostStats() { c.resourceUsageLock.Lock() c.resourceUsage = ru c.resourceUsageLock.Unlock() - c.emitStats(ru) + + // Publish Node metrics if operator has opted in + if c.config.PublishNodeMetrics { + c.emitStats(ru) + } case <-c.shutdownCh: return } diff --git a/client/config/config.go b/client/config/config.go index 127926115cd..3c968480ac3 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -117,6 +117,14 @@ type Config struct { // StatsCollectionInterval is the interval at which the Nomad client // collects resource usage stats StatsCollectionInterval time.Duration + + // PublishNodeMetrics determines whether nomad is going to publish node + // level metrics to remote Telemetry sinks + PublishNodeMetrics bool + + // PublishAllocationMetrics determines whether nomad is going to publish + // allocation metrics to remote Telemetry sinks + PublishAllocationMetrics bool } func (c *Config) Copy() *Config { diff --git a/client/task_runner.go b/client/task_runner.go index 85a842d86d1..67edbbc8af7 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -640,7 +640,7 @@ func (r *TaskRunner) Destroy() { // emitStats emits resource usage stats of tasks to remote metrics collector // sinks func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { - if ru.ResourceUsage.MemoryStats != nil { + if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics { metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) @@ -649,7 +649,7 @@ func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) } - if ru.ResourceUsage.CpuStats != nil { + if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics { metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent)) metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode)) metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode)) @@ -657,6 +657,4 @@ func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods)) metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks)) } - - //TODO Add Pid stats when we add an API to enable/disable them } diff --git a/command/agent/agent.go b/command/agent/agent.go index 2f9701ce294..5b5bfaa1e67 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -350,6 +350,8 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.ConsulConfig = a.config.Consul conf.StatsCollectionInterval = a.config.Telemetry.collectionInterval + conf.PublishNodeMetrics = a.config.Telemetry.PublishNodeMetrics + conf.PublishAllocationMetrics = a.config.Telemetry.PublishAllocationMetrics return conf, nil } diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl index 1663d7d7072..8715ef98837 100644 --- a/command/agent/config-test-fixtures/basic.hcl +++ b/command/agent/config-test-fixtures/basic.hcl @@ -70,6 +70,8 @@ telemetry { statsd_address = "127.0.0.1:2345" disable_hostname = true collection_interval = "3s" + publish_allocation_metrics = true + publish_node_metrics = true } leave_on_interrupt = true leave_on_terminate = true diff --git a/command/agent/config.go b/command/agent/config.go index b7c820bceb9..32bfd4d4209 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -240,11 +240,13 @@ type ServerConfig struct { // Telemetry is the telemetry configuration for the server type Telemetry struct { - StatsiteAddr string `mapstructure:"statsite_address"` - StatsdAddr string `mapstructure:"statsd_address"` - DisableHostname bool `mapstructure:"disable_hostname"` - CollectionInterval string `mapstructure:"collection_interval"` - collectionInterval time.Duration `mapstructure:"-"` + StatsiteAddr string `mapstructure:"statsite_address"` + StatsdAddr string `mapstructure:"statsd_address"` + DisableHostname bool `mapstructure:"disable_hostname"` + CollectionInterval string `mapstructure:"collection_interval"` + collectionInterval time.Duration `mapstructure:"-"` + PublishAllocationMetrics bool `mapstructure:"publish_allocation_metrics"` + PublishNodeMetrics bool `mapstructure:"publish_node_metrics"` // Circonus: see https://github.com/circonus-labs/circonus-gometrics // for more details on the various configuration options. diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 6abacfb10fd..b3d3dd96949 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -493,6 +493,8 @@ func parseTelemetry(result **Telemetry, list *ast.ObjectList) error { "statsd_address", "disable_hostname", "collection_interval", + "publish_allocation_metrics", + "publish_node_metrics", "circonus_api_token", "circonus_api_app", "circonus_api_url", diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 04d00c43c21..61cf7a3b555 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -83,11 +83,13 @@ func TestConfig_Parse(t *testing.T) { RetryMaxAttempts: 3, }, Telemetry: &Telemetry{ - StatsiteAddr: "127.0.0.1:1234", - StatsdAddr: "127.0.0.1:2345", - DisableHostname: true, - CollectionInterval: "3s", - collectionInterval: 3 * time.Second, + StatsiteAddr: "127.0.0.1:1234", + StatsdAddr: "127.0.0.1:2345", + DisableHostname: true, + CollectionInterval: "3s", + collectionInterval: 3 * time.Second, + PublishAllocationMetrics: true, + PublishNodeMetrics: true, }, LeaveOnInt: true, LeaveOnTerm: true,