Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tests for GPU accounting and fix parsing sinfo output #70

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 36 additions & 21 deletions gpus.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
"io/ioutil"
"os/exec"
"strings"
"strconv"
"strings"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
)

type GPUsMetrics struct {
Expand All @@ -35,11 +36,14 @@ func GPUsGetMetrics() *GPUsMetrics {
return ParseGPUsMetrics()
}

func ParseAllocatedGPUs() float64 {
var num_gpus = 0.0

func getSacctData() []byte {
args := []string{"-a", "-X", "--format=Allocgres", "--state=RUNNING", "--noheader", "--parsable2"}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that in SLURM 20.11+ AllocGRES is deprecated; consider changing to AllocTRES as in #34, or, to make it version-independent, first check whether sacct --helpformat lists AllocGRES as a possible value.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am aware of that. However, since I don't have a slurm v20 cluster, I'm not risking changing the code myself to make it compatible with that version. Instead, I'm adding tests for the current code which should make it easier to add support for slurm 20 later (e.g. in another PR).

output := string(Execute("sacct", args))
return Execute("sacct", args)
}

func ParseAllocatedGPUs(sacct_output []byte) float64 {
var num_gpus = 0.0
output := string(sacct_output)
if len(output) > 0 {
for _, line := range strings.Split(output, "\n") {
if len(line) > 0 {
Expand All @@ -54,20 +58,29 @@ func ParseAllocatedGPUs() float64 {
return num_gpus
}

func ParseTotalGPUs() float64 {
var num_gpus = 0.0

func getSinfoData() []byte {
args := []string{"-h", "-o \"%n %G\""}
output := string(Execute("sinfo", args))
return Execute("sinfo", args)
}

func ParseTotalGPUs(sinfo_output []byte) float64 {
var num_gpus = 0.0
output := string(sinfo_output)
if len(output) > 0 {
for _, line := range strings.Split(output, "\n") {
if len(line) > 0 {
line = strings.Trim(line, "\"")
descriptor := strings.Fields(line)[1]
descriptor = strings.TrimPrefix(descriptor, "gpu:")
descriptor = strings.Split(descriptor, "(")[0]
node_gpus, _ := strconv.ParseFloat(descriptor, 64)
num_gpus += node_gpus
gres := strings.Fields(line)[1]
// gres column format: comma-delimited list of resources
for _, resource := range strings.Split(gres, ",") {
if strings.HasPrefix(resource, "gpu:") {
// format: gpu:<type>:N(S:<something>), e.g. gpu:RTX2070:2(S:0)
descriptor := strings.Split(resource, ":")[2]
descriptor = strings.Split(descriptor, "(")[0]
node_gpus, _ := strconv.ParseFloat(descriptor, 64)
num_gpus += node_gpus
}
}
}
}
}
Expand All @@ -77,8 +90,10 @@ func ParseTotalGPUs() float64 {

func ParseGPUsMetrics() *GPUsMetrics {
var gm GPUsMetrics
total_gpus := ParseTotalGPUs()
allocated_gpus := ParseAllocatedGPUs()
sinfo_output := getSinfoData()
total_gpus := ParseTotalGPUs(sinfo_output)
sacct_output := getSacctData()
allocated_gpus := ParseAllocatedGPUs(sacct_output)
gm.alloc = allocated_gpus
gm.idle = total_gpus - allocated_gpus
gm.total = total_gpus
Expand Down Expand Up @@ -111,9 +126,9 @@ func Execute(command string, arguments []string) []byte {

func NewGPUsCollector() *GPUsCollector {
return &GPUsCollector{
alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
}
}
Expand Down
51 changes: 51 additions & 0 deletions gpus_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package main

import (
"io/ioutil"
"os"
"testing"
)

// TestParseTotalGPUs feeds captured sinfo output stored under test_data/ to
// ParseTotalGPUs and checks the total number of GPUs it reports.
func TestParseTotalGPUs(t *testing.T) {
	tests := []struct {
		input string  // fixture file name inside test_data/
		want  float64 // expected GPU total for that fixture
	}{
		{"sinfo_gpus_19.txt", 15}, // slurm version 19
	}
	for _, test := range tests {
		// One subtest per fixture: a failure does not hide later fixtures,
		// and the deferred Close runs as soon as this fixture is done
		// instead of piling up until the whole test returns.
		t.Run(test.input, func(t *testing.T) {
			// Read the input data from a file
			file, err := os.Open("test_data/" + test.input)
			if err != nil {
				t.Fatalf("Can not open test data: %v", err)
			}
			defer file.Close()

			data, err := ioutil.ReadAll(file)
			if err != nil {
				// A short read would otherwise surface as a confusing
				// "got X; want Y" mismatch.
				t.Fatalf("Can not read test data: %v", err)
			}

			got := ParseTotalGPUs(data)
			if got != test.want {
				t.Fatalf("got %v; want %v for file %s", got, test.want, test.input)
			}
			// %q prints the fixture as text rather than a list of byte values.
			t.Logf("%v %q %f", test, data, got)
		})
	}
}

// TestParseAllocatedGPUs feeds captured sacct output stored under test_data/
// to ParseAllocatedGPUs and checks the number of allocated GPUs it reports.
func TestParseAllocatedGPUs(t *testing.T) {
	tests := []struct {
		input string  // fixture file name inside test_data/
		want  float64 // expected allocated-GPU count for that fixture
	}{
		{"sacct_gpus_19.txt", 12}, // slurm version 19
	}
	for _, test := range tests {
		// One subtest per fixture: a failure does not hide later fixtures,
		// and the deferred Close runs as soon as this fixture is done
		// instead of piling up until the whole test returns.
		t.Run(test.input, func(t *testing.T) {
			// Read the input data from a file
			file, err := os.Open("test_data/" + test.input)
			if err != nil {
				t.Fatalf("Can not open test data: %v", err)
			}
			defer file.Close()

			data, err := ioutil.ReadAll(file)
			if err != nil {
				// A short read would otherwise surface as a confusing
				// "got X; want Y" mismatch.
				t.Fatalf("Can not read test data: %v", err)
			}

			got := ParseAllocatedGPUs(data)
			if got != test.want {
				t.Fatalf("got %v; want %v for file %s", got, test.want, test.input)
			}
			// %q prints the fixture as text rather than a list of byte values.
			t.Logf("%v %q %f", test, data, got)
		})
	}
}
13 changes: 13 additions & 0 deletions test_data/sacct_gpus_19.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
gpu:1
gpu:1
gpu:1
gpu:1
gpu:1
gpu:1
gpu:1
gpu:2
gpu:1


gpu:1
gpu:1
5 changes: 5 additions & 0 deletions test_data/sinfo_gpus_19.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
host-01 gpu:2080:2(S:0),gpu:8000:2(S:0)
host-02 gpu:2080:2(S:0),gpu:8000:2(S:0)
host-03 gpu:A4000:4(S:0)
host-04 gpu:1080:2(S:0)
host-05 gpu:2080:1(S:0)