Skip to content

Commit

Permalink
Merge pull request #1501 from hashicorp/f-stats-opt-in
Browse files Browse the repository at this point in the history
Allow operators to opt into publishing node and alloc metrics
  • Loading branch information
diptanu authored Aug 4, 2016
2 parents b140b03 + 65d503a commit ece6251
Show file tree
Hide file tree
Showing 9 changed files with 40 additions and 15 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## 0.4.1 (UNRELEASED)

__BACKWARDS INCOMPATIBILITIES:__
* telemetry: Operators must now explicitly opt in for the Nomad client to
publish allocation and node metrics

IMPROVEMENTS:
* core: Allow count 0 on system jobs [GH-1421]
* core: Gracefully handle short lived outages by holding RPC calls [GH-1403]
Expand All @@ -15,6 +19,7 @@ IMPROVEMENTS:
* client: Fingerprint network speed on Windows [GH-1443]
* driver/docker: Allow working directory to be configured [GH-1513]
* telemetry: Circonus integration for telemetry metrics [GH-1459]
* telemetry: Allow operators to opt in to publishing metrics [GH-1501]

BUG FIXES:
* core: Sanitize empty slices/maps in jobs to avoid incorrect create/destroy
Expand Down
6 changes: 5 additions & 1 deletion client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1397,7 +1397,11 @@ func (c *Client) collectHostStats() {
c.resourceUsageLock.Lock()
c.resourceUsage = ru
c.resourceUsageLock.Unlock()
c.emitStats(ru)

// Publish Node metrics if operator has opted in
if c.config.PublishNodeMetrics {
c.emitStats(ru)
}
case <-c.shutdownCh:
return
}
Expand Down
8 changes: 8 additions & 0 deletions client/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,14 @@ type Config struct {
// StatsCollectionInterval is the interval at which the Nomad client
// collects resource usage stats
StatsCollectionInterval time.Duration

// PublishNodeMetrics determines whether Nomad publishes node-level
// metrics to remote telemetry sinks
PublishNodeMetrics bool

// PublishAllocationMetrics determines whether Nomad publishes
// allocation metrics to remote telemetry sinks
PublishAllocationMetrics bool
}

func (c *Config) Copy() *Config {
Expand Down
6 changes: 2 additions & 4 deletions client/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@ func (r *TaskRunner) Destroy() {
// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
if ru.ResourceUsage.MemoryStats != nil {
if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
Expand All @@ -649,14 +649,12 @@ func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
}

if ru.ResourceUsage.CpuStats != nil {
if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
}

// TODO: Add PID stats when we add an API to enable/disable them
}
2 changes: 2 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,8 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {

conf.ConsulConfig = a.config.Consul
conf.StatsCollectionInterval = a.config.Telemetry.collectionInterval
conf.PublishNodeMetrics = a.config.Telemetry.PublishNodeMetrics
conf.PublishAllocationMetrics = a.config.Telemetry.PublishAllocationMetrics
return conf, nil
}

Expand Down
2 changes: 2 additions & 0 deletions command/agent/config-test-fixtures/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ telemetry {
statsd_address = "127.0.0.1:2345"
disable_hostname = true
collection_interval = "3s"
publish_allocation_metrics = true
publish_node_metrics = true
}
leave_on_interrupt = true
leave_on_terminate = true
Expand Down
12 changes: 7 additions & 5 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,11 +240,13 @@ type ServerConfig struct {

// Telemetry is the telemetry configuration for the agent (client and server)
type Telemetry struct {
StatsiteAddr string `mapstructure:"statsite_address"`
StatsdAddr string `mapstructure:"statsd_address"`
DisableHostname bool `mapstructure:"disable_hostname"`
CollectionInterval string `mapstructure:"collection_interval"`
collectionInterval time.Duration `mapstructure:"-"`
StatsiteAddr string `mapstructure:"statsite_address"`
StatsdAddr string `mapstructure:"statsd_address"`
DisableHostname bool `mapstructure:"disable_hostname"`
CollectionInterval string `mapstructure:"collection_interval"`
collectionInterval time.Duration `mapstructure:"-"`
PublishAllocationMetrics bool `mapstructure:"publish_allocation_metrics"`
PublishNodeMetrics bool `mapstructure:"publish_node_metrics"`

// Circonus: see https://github.com/circonus-labs/circonus-gometrics
// for more details on the various configuration options.
Expand Down
2 changes: 2 additions & 0 deletions command/agent/config_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,8 @@ func parseTelemetry(result **Telemetry, list *ast.ObjectList) error {
"statsd_address",
"disable_hostname",
"collection_interval",
"publish_allocation_metrics",
"publish_node_metrics",
"circonus_api_token",
"circonus_api_app",
"circonus_api_url",
Expand Down
12 changes: 7 additions & 5 deletions command/agent/config_parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,13 @@ func TestConfig_Parse(t *testing.T) {
RetryMaxAttempts: 3,
},
Telemetry: &Telemetry{
StatsiteAddr: "127.0.0.1:1234",
StatsdAddr: "127.0.0.1:2345",
DisableHostname: true,
CollectionInterval: "3s",
collectionInterval: 3 * time.Second,
StatsiteAddr: "127.0.0.1:1234",
StatsdAddr: "127.0.0.1:2345",
DisableHostname: true,
CollectionInterval: "3s",
collectionInterval: 3 * time.Second,
PublishAllocationMetrics: true,
PublishNodeMetrics: true,
},
LeaveOnInt: true,
LeaveOnTerm: true,
Expand Down

0 comments on commit ece6251

Please sign in to comment.