Skip to content

Commit

Permalink
Merge pull request #554 from jeremyje/osversion
Browse files Browse the repository at this point in the history
Add support for basic system metrics for Windows.
  • Loading branch information
k8s-ci-robot authored May 10, 2021
2 parents c7ce65d + d493387 commit 228f0f5
Show file tree
Hide file tree
Showing 19 changed files with 660 additions and 219 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ make clean windows-binaries
make test
# Run with containerd log monitoring enabled in Command Prompt. (Assumes containerd is installed.)
%CD%\bin\windows_amd64\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json
%CD%\output\windows_amd64\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json --config.system-stats-monitor=config\windows-system-stats-monitor.json
# Configure NPD to run as a Windows Service
sc.exe create NodeProblemDetector binpath= "%CD%\node-problem-detector.exe [FLAGS]" start= demand
Expand Down
94 changes: 94 additions & 0 deletions config/windows-system-stats-monitor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"cpu": {
"metricsConfigs": {
"cpu/load_15m": {
"displayName": "cpu/load_15m"
},
"cpu/load_1m": {
"displayName": "cpu/load_1m"
},
"cpu/load_5m": {
"displayName": "cpu/load_5m"
},
"cpu/runnable_task_count": {
"displayName": "cpu/runnable_task_count"
},
"cpu/usage_time": {
"displayName": "cpu/usage_time"
},
"system/cpu_stat": {
"displayName": "system/cpu_stat"
},
"system/interrupts_total": {
"displayName": "system/interrupts_total"
},
"system/processes_total": {
"displayName": "system/processes_total"
},
"system/procs_blocked": {
"displayName": "system/procs_blocked"
},
"system/procs_running": {
"displayName": "system/procs_running"
}
}
},
"disk": {
"includeAllAttachedBlk": false,
"includeRootBlk": false,
"lsblkTimeout": "60s",
"metricsConfigs": {
"disk/avg_queue_len": {
"displayName": "disk/avg_queue_len"
},
"disk/bytes_used": {
"displayName": "disk/bytes_used"
},
"disk/io_time": {
"displayName": "disk/io_time"
},
"disk/merged_operation_count": {
"displayName": "disk/merged_operation_count"
},
"disk/operation_bytes_count": {
"displayName": "disk/operation_bytes_count"
},
"disk/operation_count": {
"displayName": "disk/operation_count"
},
"disk/operation_time": {
"displayName": "disk/operation_time"
},
"disk/weighted_io": {
"displayName": "disk/weighted_io"
}
}
},
"host": {
"metricsConfigs": {
"host/uptime": {
"displayName": "host/uptime"
}
}
},
"invokeInterval": "60s",
"memory": {
"metricsConfigs": {
"memory/anonymous_used": {
"displayName": "memory/anonymous_used"
},
"memory/bytes_used": {
"displayName": "memory/bytes_used"
},
"memory/dirty_used": {
"displayName": "memory/dirty_used"
},
"memory/page_cache_used": {
"displayName": "memory/page_cache_used"
},
"memory/unevictable_used": {
"displayName": "memory/unevictable_used"
}
}
}
}
11 changes: 11 additions & 0 deletions pkg/systemstatsmonitor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,14 @@ Below metrics are collected from `net` component:
* `net/tx_compressed`: Cumulative count of compressed packets transmitted by the device driver.

All of the above have `interface_name` label for the net interface.

## Windows Support

NPD has preliminary support for the system stats monitor on Windows. The following modules are supported:

* CPU - Idle, System, and User metrics.
* Memory - Used and available.
* Disk - Space used and free.
* Uptime - with kernel version and product name.

All the data is currently retrieved from the `github.com/shirou/gopsutil` library. Any data that is parsed directly from `/proc` on Linux is not supported on Windows. Integration with WMI (Windows Management Instrumentation) to gather node metrics is planned for later.
62 changes: 0 additions & 62 deletions pkg/systemstatsmonitor/cpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,8 @@ limitations under the License.
package systemstatsmonitor

import (
"fmt"

"github.com/golang/glog"
"github.com/prometheus/procfs"
"github.com/shirou/gopsutil/cpu"
"github.com/shirou/gopsutil/load"

ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
"k8s.io/node-problem-detector/pkg/util/metrics"
Expand Down Expand Up @@ -174,24 +170,6 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
return &cc
}

// recordLoad samples the system load averages via load.Avg and records them
// on the load-related metrics held by the collector.
//
// Every metric field is nil-checked on its own. The previous guard looked only
// at mRunnableTaskCount, so the cpu/load_* metrics were silently skipped when
// runnable_task_count was absent and were recorded without any nil check when
// it was present.
// NOTE(review): assumes a nil metric field means "metric not configured" —
// confirm against NewCPUCollectorOrDie.
func (cc *cpuCollector) recordLoad() {
	if cc.mRunnableTaskCount == nil && cc.mCpuLoad1m == nil &&
		cc.mCpuLoad5m == nil && cc.mCpuLoad15m == nil {
		// Nothing to record, so skip querying the load average entirely.
		return
	}

	loadAvg, err := load.Avg()
	if err != nil {
		glog.Errorf("Failed to retrieve average CPU load: %v", err)
		return
	}

	if cc.mRunnableTaskCount != nil {
		// The 1-minute load average is what gets reported as the runnable task count.
		cc.mRunnableTaskCount.Record(map[string]string{}, loadAvg.Load1)
	}
	if cc.mCpuLoad1m != nil {
		cc.mCpuLoad1m.Record(map[string]string{}, loadAvg.Load1)
	}
	if cc.mCpuLoad5m != nil {
		cc.mCpuLoad5m.Record(map[string]string{}, loadAvg.Load5)
	}
	if cc.mCpuLoad15m != nil {
		cc.mCpuLoad15m.Record(map[string]string{}, loadAvg.Load15)
	}
}

func (cc *cpuCollector) recordUsage() {
if cc.mUsageTime == nil {
return
Expand Down Expand Up @@ -236,46 +214,6 @@ func (cc *cpuCollector) recordUsage() {
cc.lastUsageTime["guest_nice"] = clockTick * timersStat.GuestNice
}

// recordSystemStats reads the kernel statistics exposed under /proc and
// records the process, interrupt and per-CPU counters on the collector's
// system metrics.
//
// Both procfs calls are checked separately: previously the error returned by
// procfs.NewFS was overwritten by the fs.Stat call and never inspected.
// NOTE(review): the metric fields are recorded without nil checks here —
// confirm they are always initialized or that Record tolerates a nil receiver.
func (cc *cpuCollector) recordSystemStats() {
	fs, err := procfs.NewFS("/proc")
	if err != nil {
		glog.Errorf("Failed to open procfs: %v", err)
		return
	}

	stats, err := fs.Stat()
	if err != nil {
		glog.Errorf("Failed to retrieve cpu/process stats: %v", err)
		return
	}

	cc.mSystemProcessesTotal.Record(map[string]string{}, int64(stats.ProcessCreated))
	cc.mSystemProcsRunning.Record(map[string]string{}, int64(stats.ProcessesRunning))
	cc.mSystemProcsBlocked.Record(map[string]string{}, int64(stats.ProcessesBlocked))
	cc.mSystemInterruptsTotal.Record(map[string]string{}, int64(stats.IRQTotal))

	// One series per (cpu, stage) pair; the same tag map is reused and only
	// the stage label is rewritten between Record calls.
	for i, c := range stats.CPU {
		tags := map[string]string{}
		tags[cpuLabel] = fmt.Sprintf("cpu%d", i)

		tags[stageLabel] = "user"
		cc.mSystemCPUStat.Record(tags, c.User)
		tags[stageLabel] = "nice"
		cc.mSystemCPUStat.Record(tags, c.Nice)
		tags[stageLabel] = "system"
		cc.mSystemCPUStat.Record(tags, c.System)
		tags[stageLabel] = "idle"
		cc.mSystemCPUStat.Record(tags, c.Idle)
		tags[stageLabel] = "iowait"
		cc.mSystemCPUStat.Record(tags, c.Iowait)
		tags[stageLabel] = "iRQ"
		cc.mSystemCPUStat.Record(tags, c.IRQ)
		tags[stageLabel] = "softIRQ"
		cc.mSystemCPUStat.Record(tags, c.SoftIRQ)
		tags[stageLabel] = "steal"
		cc.mSystemCPUStat.Record(tags, c.Steal)
		tags[stageLabel] = "guest"
		cc.mSystemCPUStat.Record(tags, c.Guest)
		tags[stageLabel] = "guestNice"
		cc.mSystemCPUStat.Record(tags, c.GuestNice)
	}
}

func (cc *cpuCollector) collect() {
if cc == nil {
return
Expand Down
83 changes: 83 additions & 0 deletions pkg/systemstatsmonitor/cpu_collector_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package systemstatsmonitor

import (
"fmt"

"github.com/golang/glog"
"github.com/prometheus/procfs"
"github.com/shirou/gopsutil/load"
)

// recordLoad samples the system load averages via load.Avg and records them
// on the load-related metrics held by the collector.
//
// Every metric field is nil-checked on its own. The previous guard looked only
// at mRunnableTaskCount, so the cpu/load_* metrics were silently skipped when
// runnable_task_count was absent and were recorded without any nil check when
// it was present.
// NOTE(review): assumes a nil metric field means "metric not configured" —
// confirm against NewCPUCollectorOrDie.
func (cc *cpuCollector) recordLoad() {
	if cc.mRunnableTaskCount == nil && cc.mCpuLoad1m == nil &&
		cc.mCpuLoad5m == nil && cc.mCpuLoad15m == nil {
		// Nothing to record, so skip querying the load average entirely.
		return
	}

	loadAvg, err := load.Avg()
	if err != nil {
		glog.Errorf("Failed to retrieve average CPU load: %v", err)
		return
	}

	if cc.mRunnableTaskCount != nil {
		// The 1-minute load average is what gets reported as the runnable task count.
		cc.mRunnableTaskCount.Record(map[string]string{}, loadAvg.Load1)
	}
	if cc.mCpuLoad1m != nil {
		cc.mCpuLoad1m.Record(map[string]string{}, loadAvg.Load1)
	}
	if cc.mCpuLoad5m != nil {
		cc.mCpuLoad5m.Record(map[string]string{}, loadAvg.Load5)
	}
	if cc.mCpuLoad15m != nil {
		cc.mCpuLoad15m.Record(map[string]string{}, loadAvg.Load15)
	}
}

// recordSystemStats reads the kernel statistics exposed under /proc and
// records the process, interrupt and per-CPU counters on the collector's
// system metrics.
//
// Both procfs calls are checked separately: previously the error returned by
// procfs.NewFS was overwritten by the fs.Stat call and never inspected.
// NOTE(review): the metric fields are recorded without nil checks here —
// confirm they are always initialized or that Record tolerates a nil receiver.
func (cc *cpuCollector) recordSystemStats() {
	fs, err := procfs.NewFS("/proc")
	if err != nil {
		glog.Errorf("Failed to open procfs: %v", err)
		return
	}

	stats, err := fs.Stat()
	if err != nil {
		glog.Errorf("Failed to retrieve cpu/process stats: %v", err)
		return
	}

	cc.mSystemProcessesTotal.Record(map[string]string{}, int64(stats.ProcessCreated))
	cc.mSystemProcsRunning.Record(map[string]string{}, int64(stats.ProcessesRunning))
	cc.mSystemProcsBlocked.Record(map[string]string{}, int64(stats.ProcessesBlocked))
	cc.mSystemInterruptsTotal.Record(map[string]string{}, int64(stats.IRQTotal))

	// One series per (cpu, stage) pair; the same tag map is reused and only
	// the stage label is rewritten between Record calls.
	for i, c := range stats.CPU {
		tags := map[string]string{}
		tags[cpuLabel] = fmt.Sprintf("cpu%d", i)

		tags[stageLabel] = "user"
		cc.mSystemCPUStat.Record(tags, c.User)
		tags[stageLabel] = "nice"
		cc.mSystemCPUStat.Record(tags, c.Nice)
		tags[stageLabel] = "system"
		cc.mSystemCPUStat.Record(tags, c.System)
		tags[stageLabel] = "idle"
		cc.mSystemCPUStat.Record(tags, c.Idle)
		tags[stageLabel] = "iowait"
		cc.mSystemCPUStat.Record(tags, c.Iowait)
		tags[stageLabel] = "iRQ"
		cc.mSystemCPUStat.Record(tags, c.IRQ)
		tags[stageLabel] = "softIRQ"
		cc.mSystemCPUStat.Record(tags, c.SoftIRQ)
		tags[stageLabel] = "steal"
		cc.mSystemCPUStat.Record(tags, c.Steal)
		tags[stageLabel] = "guest"
		cc.mSystemCPUStat.Record(tags, c.Guest)
		tags[stageLabel] = "guestNice"
		cc.mSystemCPUStat.Record(tags, c.GuestNice)
	}
}
72 changes: 72 additions & 0 deletions pkg/systemstatsmonitor/cpu_collector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package systemstatsmonitor

import (
"encoding/json"
"testing"

ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
)

// fakeCPUConfig is the JSON fixture decoded by TestCpuCollector. It lists ten
// metric entries under "metricsConfigs", each with a displayName equal to its
// own key.
const fakeCPUConfig = `
{
"metricsConfigs": {
"cpu/load_15m": {
"displayName": "cpu/load_15m"
},
"cpu/load_1m": {
"displayName": "cpu/load_1m"
},
"cpu/load_5m": {
"displayName": "cpu/load_5m"
},
"cpu/runnable_task_count": {
"displayName": "cpu/runnable_task_count"
},
"cpu/usage_time": {
"displayName": "cpu/usage_time"
},
"system/cpu_stat": {
"displayName": "system/cpu_stat"
},
"system/interrupts_total": {
"displayName": "system/interrupts_total"
},
"system/processes_total": {
"displayName": "system/processes_total"
},
"system/procs_blocked": {
"displayName": "system/procs_blocked"
},
"system/procs_running": {
"displayName": "system/procs_running"
}
}
}
`

// TestCpuCollector is a smoke test: it decodes fakeCPUConfig into a CPU stats
// configuration, builds a collector from it and runs a single collection
// pass. It makes no assertions beyond the config decoding successfully.
func TestCpuCollector(t *testing.T) {
	var cpuCfg ssmtypes.CPUStatsConfig
	err := json.Unmarshal([]byte(fakeCPUConfig), &cpuCfg)
	if err != nil {
		t.Fatalf("cannot load cpu config: %s", err)
	}

	collector := NewCPUCollectorOrDie(&cpuCfg)
	collector.collect()
}
25 changes: 25 additions & 0 deletions pkg/systemstatsmonitor/cpu_collector_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package systemstatsmonitor

// recordLoad is the Windows variant of the load-average recorder. It is
// intentionally a no-op and records nothing.
func (cc *cpuCollector) recordLoad() {
// Not supported on Windows.
}

// recordSystemStats is the Windows variant of the system statistics recorder.
// It is intentionally a no-op and records nothing.
func (cc *cpuCollector) recordSystemStats() {
// Not supported on Windows.
}
Loading

0 comments on commit 228f0f5

Please sign in to comment.