From 3cccf7f6a4a431398a233538d12a025882059aa5 Mon Sep 17 00:00:00 2001 From: mahendrapaipuri Date: Fri, 17 Nov 2023 15:37:38 +0100 Subject: [PATCH 1/2] feat: Add unique jobID to label * Compute unique ID based on job metadata Signed-off-by: mahendrapaipuri --- collector/slurm.go | 77 +++++++++++++++++++++++++++-------------- collector/slurm_test.go | 4 +-- collector/utils.go | 9 +++++ 3 files changed, 62 insertions(+), 28 deletions(-) diff --git a/collector/slurm.go b/collector/slurm.go index 294364db..6f341779 100644 --- a/collector/slurm.go +++ b/collector/slurm.go @@ -26,6 +26,8 @@ var ( cgroupV2 = false metricLock = sync.RWMutex{} collectJobSteps = kingpin.Flag("collector.slurm.jobsteps.metrics", "Whether to collect metrics of all slurm job steps and tasks [WARNING: This option can result in very high cardinality of metrics].").Default("false").Bool() + useJobIdHash = kingpin.Flag("collector.slurm.unique.jobid", "Whether to calculate a hash based on job SLURM_JOBID, SLURM_JOB_UID, SLURM_JOB_GID, SLURM_JOB_NODELIST, SLURM_JOB_WORKDIR to get unique job identifier.").Default("false").Bool() + jobStatPath = kingpin.Flag("collector.slurm.job.stat.path", "Path to jobstat files that contains a file for each job with line \"$SLURM_JOB_UID $SLURM_JOB_GID $SLURM_JOB_NODELIST $SLURM_JOB_WORKDIR\". An MD5 checksum is computed on this file to get an unique job ID if --collector.slurm.unique.jobid is used.").Default("/run/slurmjobstat").String() ) type CgroupMetric struct { @@ -45,6 +47,7 @@ type CgroupMetric struct { userslice bool uid int jobid string + ujobid string step string task string batch string @@ -84,31 +87,31 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { return &slurmCollector{ cgroupV2: cgroupV2, cpuUser: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "user_seconds"), - "Cumulative CPU user seconds for jobid", []string{"batch", "jobid", "step", "task"}, nil), + "Cumulative CPU user seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), cpuSystem: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "system_seconds"), - "Cumulative CPU system seconds for jobid", []string{"batch", "jobid", "step", "task"}, nil), + "Cumulative CPU system seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), cpuTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "total_seconds"), - "Cumulative CPU total seconds for jobid", []string{"batch", "jobid", "step", "task"}, nil), + "Cumulative CPU total seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), cpus: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpus"), - "Number of CPUs in the jobid", []string{"batch", "jobid", "step", "task"}, nil), + "Number of CPUs", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), memoryRSS: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "rss_bytes"), - "Memory RSS used in bytes", []string{"batch", "jobid", "step", "task"}, nil), + "Memory RSS used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), memoryCache: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "cache_bytes"), - "Memory cache used in bytes", []string{"batch", "jobid", "step", "task"}, nil), + "Memory cache used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), memoryUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "used_bytes"), - "Memory used in bytes", []string{"batch", "jobid", "step", "task"}, nil), + "Memory used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"), - "Memory total given to jobid in bytes", []string{"batch", "jobid", "step", "task"}, nil), + "Memory total in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"), - "Memory fail count", []string{"batch", "jobid", "step", "task"}, nil), + "Memory fail count", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), memswUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "used_bytes"), - "Swap used in bytes", []string{"batch", "jobid", "step", "task"}, nil), + "Swap used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), memswTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "total_bytes"), - "Swap total given to jobid in bytes", []string{"batch", "jobid", "step", "task"}, nil), + "Swap total in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), memswFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "fail_count"), - "Swap fail count", []string{"batch", "jobid", "step", "task"}, nil), + "Swap fail count", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), collectError: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "collect_error"), - "Indicates collection error, 0=no error, 1=error", []string{"batch", "jobid", "step", "task"}, nil), + "Indicates collection error, 0=no error, 1=error", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), logger: logger, }, nil } @@ -132,9 +135,9 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { if m.err { ch <- prometheus.MustNewConstMetric(c.collectError, prometheus.GaugeValue, 1, m.name) } - ch <- prometheus.MustNewConstMetric(c.cpuUser, prometheus.GaugeValue, m.cpuUser, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.batch, m.jobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuUser, prometheus.GaugeValue, m.cpuUser, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.batch, m.jobid, m.ujobid, m.step, m.task) cpus := m.cpus if cpus == 0 { dir := filepath.Dir(n) @@ -143,15 +146,15 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { cpus = metrics[filepath.Dir(dir)].cpus } } - ch <- prometheus.MustNewConstMetric(c.cpus, prometheus.GaugeValue, float64(cpus), m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryCache, prometheus.GaugeValue, m.memoryCache, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.jobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.jobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpus, prometheus.GaugeValue, float64(cpus), m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryCache, prometheus.GaugeValue, m.memoryCache, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.jobid, m.ujobid, m.step, m.task) } return nil } @@ -195,7 +198,7 @@ func (c *slurmCollector) getJobsMetrics() (map[string]CgroupMetric, error) { metric, _ := c.getMetrics(n) if !metric.err { metricLock.Lock() - metrics[n] = metric + metrics[metric.jobid] = metric metricLock.Unlock() } wg.Done() @@ -292,6 +295,26 @@ func (c *slurmCollector) getCPUs(name string) ([]string, error) { return cpus, nil } +// Get job unique identifier from job metadata +func (c *slurmCollector) getJobUniqueId(jobid string) string { + var uniqueJobId string + var jobUid = "" + var jobGid = "" + var jobNodes = "" + var jobWorkDir = "" + var slurmJobInfo = fmt.Sprintf("%s/%s", *jobStatPath, jobid) + if _, err := os.Stat(slurmJobInfo); err == nil { + content, err := os.ReadFile(slurmJobInfo) + if err != nil { + level.Error(c.logger).Log("msg", "Failed to get metadata for job", "jobid", jobid, "err", err) + } else { + fmt.Sscanf(string(content), "%s %s %s %s", &jobUid, &jobGid, &jobNodes, &jobWorkDir) + } + uniqueJobId = GetMD5CheckSum([]string{jobid, jobUid, jobGid, jobNodes, jobWorkDir}) + } + return uniqueJobId +} + // Get job details from cgroups v1 func (c *slurmCollector) getInfoV1(name string, metric *CgroupMetric) { var err error @@ -366,6 +389,7 @@ func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error) metric.cpus = len(cpus) } c.getInfoV1(name, &metric) + metric.ujobid = c.getJobUniqueId(metric.jobid) return metric, nil } @@ -431,5 +455,6 @@ func (c *slurmCollector) getCgroupsV2Metrics(name string) (CgroupMetric, error) metric.cpus = len(cpus) } c.getInfoV2(name, &metric) + metric.ujobid = c.getJobUniqueId(metric.jobid) return metric, nil } diff --git a/collector/slurm_test.go b/collector/slurm_test.go index 64341ace..db43d970 100644 --- a/collector/slurm_test.go +++ b/collector/slurm_test.go @@ -41,7 +41,7 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) { if err != nil { t.Fatalf("Cannot retrieve data from getJobsMetrics function: %v ", err) } - if !reflect.DeepEqual(metrics["/system.slice/slurmstepd.scope/job_1009248"], expectedSlurmMetrics) { + if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) { t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics) } } @@ -76,7 +76,7 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { if err != nil { t.Fatalf("Cannot retrieve data from getJobsMetrics function: %v ", err) } - if !reflect.DeepEqual(metrics["/slurm/uid_1000/job_1009248"], expectedSlurmMetrics) { + if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) { t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics) } } diff --git a/collector/utils.go b/collector/utils.go index 9926044a..60165139 100644 --- a/collector/utils.go +++ b/collector/utils.go @@ -1,6 +1,8 @@ package collector import ( + "crypto/md5" + "encoding/hex" "encoding/json" "fmt" "io" @@ -87,3 +89,10 @@ func GetRteEnergyMixData() (float64, error) { } return float64(fields[0].TauxCo2), nil } + +// Get md5 checksum for given slice of strings +func GetMD5CheckSum(stringSlice []string) string { + s := strings.Join(stringSlice[:], ",") + hash := md5.Sum([]byte(s)) + return hex.EncodeToString(hash[:]) +} From 656e390b35c9a07e57058b6e0c90213ea4e79015 Mon Sep 17 00:00:00 2001 From: mahendrapaipuri Date: Fri, 17 Nov 2023 15:37:54 +0100 Subject: [PATCH 2/2] test: Update tests Signed-off-by: mahendrapaipuri --- .../fixtures/e2e-test-cgroupsv1-output.txt | 36 +++++++++---------- .../fixtures/e2e-test-cgroupsv2-output.txt | 36 +++++++++---------- collector/fixtures/slurmjobstat/1009248 | 1 + e2e-test.sh | 4 ++- 4 files changed, 40 insertions(+), 37 deletions(-) create mode 100644 collector/fixtures/slurmjobstat/1009248 diff --git a/collector/fixtures/e2e-test-cgroupsv1-output.txt b/collector/fixtures/e2e-test-cgroupsv1-output.txt index 07037c65..02c3733b 100644 --- a/collector/fixtures/e2e-test-cgroupsv1-output.txt +++ b/collector/fixtures/e2e-test-cgroupsv1-output.txt @@ -1,15 +1,15 @@ -# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds for jobid +# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds # TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task=""} 0.45 -# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds for jobid +batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0.45 +# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds # TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task=""} 1.012410966 -# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds for jobid +batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 1.012410966 +# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds # TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task=""} 0.39 -# HELP batchjob_cpus Number of CPUs in the jobid +batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0.39 +# HELP batchjob_cpus Number of CPUs # TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobid="1009248",step="",task=""} 0 +batchjob_cpus{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. # TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts @@ -17,28 +17,28 @@ batchjob_cpus{batch="slurm",jobid="1009248",step="",task=""} 0 batchjob_ipmi_dcmi_watts_total 332 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task=""} 2.1086208e+07 +batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2.1086208e+07 # HELP batchjob_memory_fail_count Memory fail count # TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0 +batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 # HELP batchjob_memory_rss_bytes Memory RSS used in bytes # TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task=""} 1.0407936e+07 -# HELP batchjob_memory_total_bytes Memory total given to jobid in bytes +batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 1.0407936e+07 +# HELP batchjob_memory_total_bytes Memory total in bytes # TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 2.01362030592e+11 +batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2.01362030592e+11 # HELP batchjob_memory_used_bytes Memory used in bytes # TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.0194048e+07 +batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.0194048e+07 # HELP batchjob_memsw_fail_count Swap fail count # TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0 -# HELP batchjob_memsw_total_bytes Swap total given to jobid in bytes +batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +# HELP batchjob_memsw_total_bytes Swap total in bytes # TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 9.223372036854772e+18 +batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 9.223372036854772e+18 # HELP batchjob_memsw_used_bytes Swap used in bytes # TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.032512e+07 +batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.032512e+07 # HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU # TYPE batchjob_nvidia_gpu_jobid gauge batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 diff --git a/collector/fixtures/e2e-test-cgroupsv2-output.txt b/collector/fixtures/e2e-test-cgroupsv2-output.txt index aa37d90b..a46ba88e 100644 --- a/collector/fixtures/e2e-test-cgroupsv2-output.txt +++ b/collector/fixtures/e2e-test-cgroupsv2-output.txt @@ -1,15 +1,15 @@ -# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds for jobid +# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds # TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task=""} 115.777502 -# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds for jobid +batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 115.777502 +# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds # TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task=""} 60491.070351 -# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds for jobid +batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 60491.070351 +# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds # TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task=""} 60375.292848 -# HELP batchjob_cpus Number of CPUs in the jobid +batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 60375.292848 +# HELP batchjob_cpus Number of CPUs # TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobid="1009248",step="",task=""} 2 +batchjob_cpus{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. # TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts @@ -17,28 +17,28 @@ batchjob_cpus{batch="slurm",jobid="1009248",step="",task=""} 2 batchjob_ipmi_dcmi_watts_total 332 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task=""} 0 +batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 # HELP batchjob_memory_fail_count Memory fail count # TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0 +batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 # HELP batchjob_memory_rss_bytes Memory RSS used in bytes # TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.098592768e+09 -# HELP batchjob_memory_total_bytes Memory total given to jobid in bytes +batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.098592768e+09 +# HELP batchjob_memory_total_bytes Memory total in bytes # TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.294967296e+09 +batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.294967296e+09 # HELP batchjob_memory_used_bytes Memory used in bytes # TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.111491072e+09 +batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.111491072e+09 # HELP batchjob_memsw_fail_count Swap fail count # TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0 -# HELP batchjob_memsw_total_bytes Swap total given to jobid in bytes +batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +# HELP batchjob_memsw_total_bytes Swap total in bytes # TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 0 +batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 # HELP batchjob_memsw_used_bytes Swap used in bytes # TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 0 +batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 # HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU # TYPE batchjob_nvidia_gpu_jobid gauge batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 diff --git a/collector/fixtures/slurmjobstat/1009248 b/collector/fixtures/slurmjobstat/1009248 new file mode 100644 index 00000000..4f0965a4 --- /dev/null +++ b/collector/fixtures/slurmjobstat/1009248 @@ -0,0 +1 @@ +1000 1000 compute-[0-2] /home/user/slurm diff --git a/e2e-test.sh b/e2e-test.sh index f30b332e..7dd0d6dc 100755 --- a/e2e-test.sh +++ b/e2e-test.sh @@ -12,7 +12,7 @@ skip_re="^(go_|batchjob_exporter_build_info|batchjob_scrape_collector_duration_s arch="$(uname -m)" cgroups_mode=$([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] && echo "unified" || ( [ -e /sys/fs/cgroup/unified/ ] && echo "hybrid" || echo "legacy")) - +# cgroups_mode="legacy" echo "cgroups mode detected is ${cgroups_mode}" case "${cgroups_mode}" in @@ -53,6 +53,8 @@ fi PATH=$PWD/collector/fixtures:$PATH ./batchjob_exporter \ --path.sysfs="collector/fixtures/sys" \ --path.cgroupfs="collector/fixtures/sys/fs/cgroup" \ + --collector.slurm.unique.jobid \ + --collector.slurm.job.stat.path="collector/fixtures/slurmjobstat" \ --collector.ipmi.dcmi.wrapper.path="collector/fixtures/ipmi-dcmi-wrapper.sh" \ --collector.nvidia_gpu \ --collector.nvidia.gpu.stat.path="collector/fixtures/gpustat" \