forked from PatWie/cluster-smi
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcluster.go
110 lines (86 loc) · 2.47 KB
/
cluster.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
package main
import (
"github.com/minostauros/cluster-smi/cluster"
"github.com/minostauros/cluster-smi/nvml"
"github.com/minostauros/cluster-smi/proc"
"os"
"os/user"
"strconv"
"time"
)
// FetchCluster refreshes every node of c by calling FetchNode on each entry
// of c.Nodes in order. Nodes are passed by pointer, so the entries stored in
// c.Nodes are updated in place.
func FetchCluster(c *cluster.Cluster) {
	for i := range c.Nodes {
		node := &c.Nodes[i]
		FetchNode(node)
	}
}
// InitNode fills in the identity and timing fields of n and reserves one
// zero-valued cluster.Device entry in n.Devices per device reported by
// nvml.GetDevices.
//
// It sets n.Name to the host name, n.Time to the current time, n.BootTime
// from proc.BootTime and n.ClockTicks from proc.ClockTicks. It panics if the
// host name cannot be determined. Errors from proc.BootTime and
// nvml.GetDevices are discarded and whatever values those calls returned are
// used as is.
//
// Device entries are appended, so existing entries in n.Devices are kept.
func InitNode(n *cluster.Node) {
	hostname, err := os.Hostname()
	if err != nil {
		panic(err)
	}
	n.Name = hostname
	n.Time = time.Now()

	bootTime, _ := proc.BootTime()
	n.BootTime = bootTime
	n.ClockTicks = proc.ClockTicks()

	devices, _ := nvml.GetDevices()
	for range devices {
		// Placeholder slot; FetchNode overwrites its fields on every refresh.
		n.Devices = append(n.Devices, cluster.Device{})
	}
}
// FetchNode refreshes n with a current snapshot of every device returned by
// nvml.GetDevices.
//
// It updates n.Time and n.BootTime, then for each device stores its name,
// utilization, memory usage, fan speed, power usage, temperature and the
// processes using it in n.Devices[idx]. n.Devices is grown when it has fewer
// entries than there are devices, so calling FetchNode on a node whose device
// slots were not (fully) reserved by InitNode no longer panics.
//
// n.ClockTicks is read but not set here; InitNode populates it, and it is
// used as a divisor when computing process run times.
//
// Errors from nvml.GetDevices, proc.BootTime and the per-device metric
// queries are deliberately ignored (best effort): the values those calls
// returned are stored as is. A failure of device.GetProcessInfo panics.
func FetchNode(n *cluster.Node) {
	devices, _ := nvml.GetDevices()

	n.Time = time.Now()
	bootTime, _ := proc.BootTime()
	n.BootTime = bootTime
	now := proc.TimeOfDay()

	for idx, device := range devices {
		meminfo, _ := device.GetMemoryInfo()
		gpuPercent, _, _ := device.GetUtilization()
		powerUsage, _ := device.GetPowerUsage()
		fanSpeed, _ := device.GetFanSpeed()
		tempc, _, _ := device.GetTemperature()

		// Scale before dividing: Used/Total alone is always below one and
		// would truncate to 0. A zero Total (e.g. after a failed memory
		// query) would otherwise be a division by zero, so report 0 percent.
		memPercent := 0
		if meminfo.Total > 0 {
			memPercent = int(meminfo.Used * 100 / meminfo.Total)
		}

		// Read the processes currently using this device.
		deviceProcs, err := device.GetProcessInfo()
		if err != nil {
			panic(err)
		}

		// Collect information about each process. The slice stays nil when
		// there are none.
		var processes []cluster.Process
		for _, dp := range deviceProcs {
			// Entries with PID 0 carry no usable process information.
			if int(dp.Pid) == 0 {
				continue
			}

			pid := dp.Pid
			info := proc.InfoFromPid(pid)
			uid := proc.UIDFromPID(pid)

			// Fall back to a placeholder when the UID has no passwd entry.
			username := "unknown"
			if u, err := user.LookupId(strconv.Itoa(uid)); err == nil {
				username = u.Username
			}

			containerName := proc.ContainerNameFromPID(pid)
			extendedCmd := proc.CmdFromPID(pid)

			processes = append(processes, cluster.Process{
				Pid:           pid,
				UsedGpuMemory: dp.UsedGpuMemory,
				Name:          info.Command,
				Username:      username,
				ContainerName: containerName,
				// Time since boot minus the process start offset, where the
				// start offset is StartTime divided by the clock tick rate.
				RunTime:         (now - n.BootTime) - (info.StartTime / n.ClockTicks),
				ExtendedCommand: extendedCmd,
			})
		}

		// Make sure a slot exists for this device before writing to it.
		for len(n.Devices) <= idx {
			n.Devices = append(n.Devices, cluster.Device{})
		}

		dev := &n.Devices[idx]
		dev.Id = idx
		dev.Name = device.DeviceName
		dev.Utilization = gpuPercent
		dev.MemoryUtilization = cluster.Memory{meminfo.Used, meminfo.Free, meminfo.Total, memPercent}
		dev.FanSpeed = fanSpeed
		dev.PowerUsage = powerUsage
		dev.Temperature = tempc
		dev.Processes = processes
	}
}