Merge pull request #3 from mahendrapaipuri/unique_jobid_label
Add unique jobid label for SLURM jobs
mahendrapaipuri authored Nov 17, 2023
2 parents f14f981 + 656e390 commit 2e6f5a0
Showing 7 changed files with 102 additions and 65 deletions.
36 changes: 18 additions & 18 deletions collector/fixtures/e2e-test-cgroupsv1-output.txt
@@ -1,44 +1,44 @@
# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds for jobid
# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds
# TYPE batchjob_cpu_system_seconds gauge
batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task=""} 0.45
# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds for jobid
batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0.45
# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds
# TYPE batchjob_cpu_total_seconds gauge
batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task=""} 1.012410966
# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds for jobid
batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 1.012410966
# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds
# TYPE batchjob_cpu_user_seconds gauge
batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task=""} 0.39
# HELP batchjob_cpus Number of CPUs in the jobid
batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0.39
# HELP batchjob_cpus Number of CPUs
# TYPE batchjob_cpus gauge
batchjob_cpus{batch="slurm",jobid="1009248",step="",task=""} 0
batchjob_cpus{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0
# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
# TYPE batchjob_exporter_build_info gauge
# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_watts_total counter
batchjob_ipmi_dcmi_watts_total 332
# HELP batchjob_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_memory_cache_bytes gauge
batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task=""} 2.1086208e+07
batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2.1086208e+07
# HELP batchjob_memory_fail_count Memory fail count
# TYPE batchjob_memory_fail_count gauge
batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0
batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0
# HELP batchjob_memory_rss_bytes Memory RSS used in bytes
# TYPE batchjob_memory_rss_bytes gauge
batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task=""} 1.0407936e+07
# HELP batchjob_memory_total_bytes Memory total given to jobid in bytes
batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 1.0407936e+07
# HELP batchjob_memory_total_bytes Memory total in bytes
# TYPE batchjob_memory_total_bytes gauge
batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 2.01362030592e+11
batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2.01362030592e+11
# HELP batchjob_memory_used_bytes Memory used in bytes
# TYPE batchjob_memory_used_bytes gauge
batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.0194048e+07
batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.0194048e+07
# HELP batchjob_memsw_fail_count Swap fail count
# TYPE batchjob_memsw_fail_count gauge
batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0
# HELP batchjob_memsw_total_bytes Swap total given to jobid in bytes
batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0
# HELP batchjob_memsw_total_bytes Swap total in bytes
# TYPE batchjob_memsw_total_bytes gauge
batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 9.223372036854772e+18
batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 9.223372036854772e+18
# HELP batchjob_memsw_used_bytes Swap used in bytes
# TYPE batchjob_memsw_used_bytes gauge
batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.032512e+07
batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.032512e+07
# HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU
# TYPE batchjob_nvidia_gpu_jobid gauge
batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000
36 changes: 18 additions & 18 deletions collector/fixtures/e2e-test-cgroupsv2-output.txt
@@ -1,44 +1,44 @@
# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds for jobid
# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds
# TYPE batchjob_cpu_system_seconds gauge
batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task=""} 115.777502
# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds for jobid
batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 115.777502
# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds
# TYPE batchjob_cpu_total_seconds gauge
batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task=""} 60491.070351
# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds for jobid
batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 60491.070351
# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds
# TYPE batchjob_cpu_user_seconds gauge
batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task=""} 60375.292848
# HELP batchjob_cpus Number of CPUs in the jobid
batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 60375.292848
# HELP batchjob_cpus Number of CPUs
# TYPE batchjob_cpus gauge
batchjob_cpus{batch="slurm",jobid="1009248",step="",task=""} 2
batchjob_cpus{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2
# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
# TYPE batchjob_exporter_build_info gauge
# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_watts_total counter
batchjob_ipmi_dcmi_watts_total 332
# HELP batchjob_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_memory_cache_bytes gauge
batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task=""} 0
batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0
# HELP batchjob_memory_fail_count Memory fail count
# TYPE batchjob_memory_fail_count gauge
batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0
batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0
# HELP batchjob_memory_rss_bytes Memory RSS used in bytes
# TYPE batchjob_memory_rss_bytes gauge
batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.098592768e+09
# HELP batchjob_memory_total_bytes Memory total given to jobid in bytes
batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.098592768e+09
# HELP batchjob_memory_total_bytes Memory total in bytes
# TYPE batchjob_memory_total_bytes gauge
batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.294967296e+09
batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.294967296e+09
# HELP batchjob_memory_used_bytes Memory used in bytes
# TYPE batchjob_memory_used_bytes gauge
batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.111491072e+09
batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.111491072e+09
# HELP batchjob_memsw_fail_count Swap fail count
# TYPE batchjob_memsw_fail_count gauge
batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0
# HELP batchjob_memsw_total_bytes Swap total given to jobid in bytes
batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0
# HELP batchjob_memsw_total_bytes Swap total in bytes
# TYPE batchjob_memsw_total_bytes gauge
batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 0
batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0
# HELP batchjob_memsw_used_bytes Swap used in bytes
# TYPE batchjob_memsw_used_bytes gauge
batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 0
batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0
# HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU
# TYPE batchjob_nvidia_gpu_jobid gauge
batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000
1 change: 1 addition & 0 deletions collector/fixtures/slurmjobstat/1009248
@@ -0,0 +1 @@
1000 1000 compute-[0-2] /home/user/slurm
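
The new fixture above emulates the per-job metadata file that the exporter reads when the unique-jobid option is enabled (see the --collector.slurm.unique.jobid and --collector.slurm.job.stat.path flags in collector/slurm.go below). The commit does not show how these files get created on a real cluster; the following Go sketch only illustrates the expected file format, assuming a SLURM prolog-style hook exports the variables named in the flag help and is allowed to write to /run/slurmjobstat:

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// Illustrative only: writes /run/slurmjobstat/<jobid> with the line format the
// exporter expects, i.e. "$SLURM_JOB_UID $SLURM_JOB_GID $SLURM_JOB_NODELIST $SLURM_JOB_WORKDIR".
// A real deployment would more likely do this from a prolog/epilog shell script.
func main() {
	jobID := os.Getenv("SLURM_JOBID")
	line := fmt.Sprintf("%s %s %s %s\n",
		os.Getenv("SLURM_JOB_UID"),
		os.Getenv("SLURM_JOB_GID"),
		os.Getenv("SLURM_JOB_NODELIST"),
		os.Getenv("SLURM_JOB_WORKDIR"),
	)
	path := filepath.Join("/run/slurmjobstat", jobID)
	if err := os.WriteFile(path, []byte(line), 0o644); err != nil {
		fmt.Fprintln(os.Stderr, "failed to write jobstat file:", err)
		os.Exit(1)
	}
}

With one such file per job, two jobs that happen to reuse the same numeric SLURM job ID still get distinct ujobid values as long as the user, group, node list or working directory differ.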
77 changes: 51 additions & 26 deletions collector/slurm.go
@@ -26,6 +26,8 @@ var (
cgroupV2 = false
metricLock = sync.RWMutex{}
collectJobSteps = kingpin.Flag("collector.slurm.jobsteps.metrics", "Whether to collect metrics of all slurm job steps and tasks [WARNING: This option can result in very high cardinality of metrics].").Default("false").Bool()
useJobIdHash = kingpin.Flag("collector.slurm.unique.jobid", "Whether to calculate a hash based on job SLURM_JOBID, SLURM_JOB_UID, SLURM_JOB_GID, SLURM_JOB_NODELIST, SLURM_JOB_WORKDIR to get unique job identifier.").Default("false").Bool()
jobStatPath = kingpin.Flag("collector.slurm.job.stat.path", "Path to jobstat files that contains a file for each job with line \"$SLURM_JOB_UID $SLURM_JOB_GID $SLURM_JOB_NODELIST $SLURM_JOB_WORKDIR\". An MD5 checksum is computed on this file to get an unique job ID if --collector.slurm.unique.jobid is used.").Default("/run/slurmjobstat").String()
)

type CgroupMetric struct {
@@ -45,6 +47,7 @@ type CgroupMetric struct {
userslice bool
uid int
jobid string
ujobid string
step string
task string
batch string
@@ -84,31 +87,31 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
return &slurmCollector{
cgroupV2: cgroupV2,
cpuUser: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "user_seconds"),
"Cumulative CPU user seconds for jobid", []string{"batch", "jobid", "step", "task"}, nil),
"Cumulative CPU user seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
cpuSystem: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "system_seconds"),
"Cumulative CPU system seconds for jobid", []string{"batch", "jobid", "step", "task"}, nil),
"Cumulative CPU system seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
cpuTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "total_seconds"),
"Cumulative CPU total seconds for jobid", []string{"batch", "jobid", "step", "task"}, nil),
"Cumulative CPU total seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
cpus: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpus"),
"Number of CPUs in the jobid", []string{"batch", "jobid", "step", "task"}, nil),
"Number of CPUs", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
memoryRSS: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "rss_bytes"),
"Memory RSS used in bytes", []string{"batch", "jobid", "step", "task"}, nil),
"Memory RSS used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
memoryCache: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "cache_bytes"),
"Memory cache used in bytes", []string{"batch", "jobid", "step", "task"}, nil),
"Memory cache used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
memoryUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "used_bytes"),
"Memory used in bytes", []string{"batch", "jobid", "step", "task"}, nil),
"Memory used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"),
"Memory total given to jobid in bytes", []string{"batch", "jobid", "step", "task"}, nil),
"Memory total in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"),
"Memory fail count", []string{"batch", "jobid", "step", "task"}, nil),
"Memory fail count", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
memswUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "used_bytes"),
"Swap used in bytes", []string{"batch", "jobid", "step", "task"}, nil),
"Swap used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
memswTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "total_bytes"),
"Swap total given to jobid in bytes", []string{"batch", "jobid", "step", "task"}, nil),
"Swap total in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
memswFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "fail_count"),
"Swap fail count", []string{"batch", "jobid", "step", "task"}, nil),
"Swap fail count", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
collectError: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "collect_error"),
"Indicates collection error, 0=no error, 1=error", []string{"batch", "jobid", "step", "task"}, nil),
"Indicates collection error, 0=no error, 1=error", []string{"batch", "jobid", "ujobid", "step", "task"}, nil),
logger: logger,
}, nil
}
@@ -132,9 +135,9 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error {
if m.err {
ch <- prometheus.MustNewConstMetric(c.collectError, prometheus.GaugeValue, 1, m.name)
}
ch <- prometheus.MustNewConstMetric(c.cpuUser, prometheus.GaugeValue, m.cpuUser, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.cpuUser, prometheus.GaugeValue, m.cpuUser, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.batch, m.jobid, m.ujobid, m.step, m.task)
cpus := m.cpus
if cpus == 0 {
dir := filepath.Dir(n)
@@ -143,15 +146,15 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error {
cpus = metrics[filepath.Dir(dir)].cpus
}
}
ch <- prometheus.MustNewConstMetric(c.cpus, prometheus.GaugeValue, float64(cpus), m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryCache, prometheus.GaugeValue, m.memoryCache, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.jobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.cpus, prometheus.GaugeValue, float64(cpus), m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryCache, prometheus.GaugeValue, m.memoryCache, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.jobid, m.ujobid, m.step, m.task)
ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.jobid, m.ujobid, m.step, m.task)
}
return nil
}
@@ -195,7 +198,7 @@ func (c *slurmCollector) getJobsMetrics() (map[string]CgroupMetric, error) {
metric, _ := c.getMetrics(n)
if !metric.err {
metricLock.Lock()
metrics[n] = metric
metrics[metric.jobid] = metric
metricLock.Unlock()
}
wg.Done()
@@ -292,6 +295,26 @@ func (c *slurmCollector) getCPUs(name string) ([]string, error) {
return cpus, nil
}

// Get job unique identifier from job metadata
func (c *slurmCollector) getJobUniqueId(jobid string) string {
var uniqueJobId string
var jobUid = ""
var jobGid = ""
var jobNodes = ""
var jobWorkDir = ""
var slurmJobInfo = fmt.Sprintf("%s/%s", *jobStatPath, jobid)
if _, err := os.Stat(slurmJobInfo); err == nil {
content, err := os.ReadFile(slurmJobInfo)
if err != nil {
level.Error(c.logger).Log("msg", "Failed to get metadata for job", "jobid", jobid, "err", err)
} else {
fmt.Sscanf(string(content), "%s %s %s %s", &jobUid, &jobGid, &jobNodes, &jobWorkDir)
}
uniqueJobId = GetMD5CheckSum([]string{jobid, jobUid, jobGid, jobNodes, jobWorkDir})
}
return uniqueJobId
}

// Get job details from cgroups v1
func (c *slurmCollector) getInfoV1(name string, metric *CgroupMetric) {
var err error
@@ -366,6 +389,7 @@ func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error)
metric.cpus = len(cpus)
}
c.getInfoV1(name, &metric)
metric.ujobid = c.getJobUniqueId(metric.jobid)
return metric, nil
}

@@ -431,5 +455,6 @@ func (c *slurmCollector) getCgroupsV2Metrics(name string) (CgroupMetric, error)
metric.cpus = len(cpus)
}
c.getInfoV2(name, &metric)
metric.ujobid = c.getJobUniqueId(metric.jobid)
return metric, nil
}
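
getJobUniqueId above delegates the hashing to GetMD5CheckSum, which is defined elsewhere in the repository and is not part of this diff. A minimal sketch of what such a helper could look like, assuming the inputs are simply joined before hashing (the real implementation may combine them differently):

package collector

import (
	"crypto/md5"
	"encoding/hex"
	"strings"
)

// GetMD5CheckSum returns the hex-encoded MD5 digest of the space-joined fields.
// Sketch only; the actual helper in this repository may differ.
func GetMD5CheckSum(fields []string) string {
	sum := md5.Sum([]byte(strings.Join(fields, " ")))
	return hex.EncodeToString(sum[:])
}

Hashing the job ID together with the UID, GID, node list and working directory is what makes ujobid effectively unique: a recycled SLURM job ID will normally differ in at least one of the other fields.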
4 changes: 2 additions & 2 deletions collector/slurm_test.go
@@ -41,7 +41,7 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) {
if err != nil {
t.Fatalf("Cannot retrieve data from getJobsMetrics function: %v ", err)
}
if !reflect.DeepEqual(metrics["/system.slice/slurmstepd.scope/job_1009248"], expectedSlurmMetrics) {
if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) {
t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics)
}
}
@@ -76,7 +76,7 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) {
if err != nil {
t.Fatalf("Cannot retrieve data from getJobsMetrics function: %v ", err)
}
if !reflect.DeepEqual(metrics["/slurm/uid_1000/job_1009248"], expectedSlurmMetrics) {
if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) {
t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics)
}
}
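
The existing tests above exercise getJobsMetrics end to end; the new getJobUniqueId helper is not tested on its own in this commit. A possible in-package test, assuming the jobStatPath flag target can be overridden directly and that go-kit's log.NewNopLogger is imported in the test file (only the helper name, fixture path and job ID come from the diff; the rest is an assumption):

// Hypothetical test, not part of this commit.
func TestGetJobUniqueId(t *testing.T) {
	// Point the collector at the fixture directory added in this commit.
	*jobStatPath = "fixtures/slurmjobstat"
	c := slurmCollector{logger: log.NewNopLogger()}
	if got := c.getJobUniqueId("1009248"); got == "" {
		t.Fatalf("expected a non-empty unique job id for fixture job 1009248")
	}
}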