diff --git a/Makefile b/Makefile index 4a90239e..00f54393 100644 --- a/Makefile +++ b/Makefile @@ -151,6 +151,7 @@ test-e2e: build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc ./scripts/e2e-test.sh -s exporter-cgroups-v1 ./scripts/e2e-test.sh -s exporter-cgroups-v1-memory-subsystem ./scripts/e2e-test.sh -s exporter-cgroups-v2-nvidia-ipmiutil + ./scripts/e2e-test.sh -s exporter-cgroups-v2-nvidia-gpu-reordering ./scripts/e2e-test.sh -s exporter-cgroups-v2-amd-ipmitool ./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu ./scripts/e2e-test.sh -s exporter-cgroups-v2-procfs @@ -202,6 +203,7 @@ test-e2e-update: build pkg/collector/testdata/sys/.unpacked pkg/collector/testda ./scripts/e2e-test.sh -s exporter-cgroups-v1 -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v1-memory-subsystem -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-nvidia-ipmiutil -u || true + ./scripts/e2e-test.sh -s exporter-cgroups-v2-nvidia-gpu-reordering -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-amd-ipmitool -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-procfs -u || true diff --git a/README.md b/README.md index 9cbe8635..a1f4c796 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ in a resource manager agnostic way. - Monitor energy, performance, IO and network metrics for different types of resource managers (SLURM, Openstack, k8s) -- Support NVIDIA and AMD GPUs +- Support NVIDIA (MIG and vGPU) and AMD GPUs - Realtime access to metrics *via* Grafana dashboards - Access control to Prometheus datasource in Grafana - Stores aggregated metrics in a separate DB that can be retained for long time diff --git a/cmd/ceems_api_server/main_test.go b/cmd/ceems_api_server/main_test.go index 6b59a429..c366fd37 100644 --- a/cmd/ceems_api_server/main_test.go +++ b/cmd/ceems_api_server/main_test.go @@ -7,6 +7,8 @@ import ( "path/filepath" "testing" "time" + + "github.com/stretchr/testify/require" ) var binary, _ = filepath.Abs("../../bin/ceems_api_server") @@ -24,23 +26,17 @@ func TestBatchjobStatsExecutable(t *testing.T) { tmpSacctPath := tmpDir + "/sacct" sacctPath, err := filepath.Abs("../../pkg/api/testdata/sacct") - if err != nil { - t.Error(err) - } + require.NoError(t, err) err = os.Link(sacctPath, tmpSacctPath) - if err != nil { - t.Error(err) - } + require.NoError(t, err) usagestats := exec.Command( binary, "--web.listen-address", address, "--no-security.drop-privileges", ) - if err := runCommandAndTests(usagestats); err != nil { - t.Error(err) - } + require.NoError(t, runCommandAndTests(usagestats)) } func runCommandAndTests(cmd *exec.Cmd) error { diff --git a/cmd/ceems_exporter/main_test.go b/cmd/ceems_exporter/main_test.go index 6ab4a2f4..253d9b7a 100644 --- a/cmd/ceems_exporter/main_test.go +++ b/cmd/ceems_exporter/main_test.go @@ -11,6 +11,7 @@ import ( "time" "github.com/prometheus/procfs" + "github.com/stretchr/testify/require" ) var binary, _ = filepath.Abs("../../bin/ceems_exporter") @@ -37,14 +38,10 @@ func TestFileDescriptorLeak(t *testing.T) { } sysfsPath, err := filepath.Abs("../../pkg/collector/testdata/sys/fs/cgroup") - if err != nil { - t.Errorf("Failed to read testdata: %s", err) - } + require.NoError(t, err) procfsPath, err := filepath.Abs("../../pkg/collector/testdata/proc") - if err != nil { - t.Errorf("Failed to read testdata: %s", err) - } + require.NoError(t, err) exporter := exec.Command( binary, @@ -91,9 +88,7 @@ func TestFileDescriptorLeak(t *testing.T) { return 
nil } - if err := runCommandAndTests(exporter, address, test); err != nil { - t.Error(err) - } + require.NoError(t, runCommandAndTests(exporter, address, test)) } func queryExporter(address string) error { diff --git a/cmd/ceems_lb/main_test.go b/cmd/ceems_lb/main_test.go index aaa79ae5..09a2cedf 100644 --- a/cmd/ceems_lb/main_test.go +++ b/cmd/ceems_lb/main_test.go @@ -7,6 +7,8 @@ import ( "path/filepath" "testing" "time" + + "github.com/stretchr/testify/require" ) var binary, _ = filepath.Abs("../../bin/ceems_lb") @@ -24,14 +26,10 @@ func TestCEEMSLBExecutable(t *testing.T) { tmpConfigPath := tmpDir + "/config.yaml" configPath, err := filepath.Abs("../../build/config/ceems_lb/ceems_lb.yml") - if err != nil { - t.Error(err) - } + require.NoError(t, err) err = os.Link(configPath, tmpConfigPath) - if err != nil { - t.Error(err) - } + require.NoError(t, err) lb := exec.Command( binary, "--path.data", tmpDir, @@ -39,9 +37,7 @@ func TestCEEMSLBExecutable(t *testing.T) { "--web.listen-address", address, "--no-security.drop-privileges", ) - if err := runCommandAndTests(lb); err != nil { - t.Error(err) - } + require.NoError(t, runCommandAndTests(lb)) } func runCommandAndTests(cmd *exec.Cmd) error { diff --git a/pkg/collector/gpu.go b/pkg/collector/gpu.go new file mode 100644 index 00000000..c9180959 --- /dev/null +++ b/pkg/collector/gpu.go @@ -0,0 +1,550 @@ +package collector + +import ( + "encoding/xml" + "fmt" + "os" + "os/exec" + "regexp" + "slices" + "strconv" + "strings" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/mahendrapaipuri/ceems/internal/osexec" +) + +// Used for e2e tests. +var ( + gpuType = CEEMSExporterApp.Flag( + "collector.gpu.type", + "GPU device type. Currently only nvidia and amd devices are supported.", + ).Hidden().Enum("nvidia", "amd") + nvidiaSmiPath = CEEMSExporterApp.Flag( + "collector.gpu.nvidia-smi-path", + "Absolute path to nvidia-smi binary. Use only for testing.", + ).Hidden().Default("").String() + rocmSmiPath = CEEMSExporterApp.Flag( + "collector.gpu.rocm-smi-path", + "Absolute path to rocm-smi binary. Use only for testing.", + ).Hidden().Default("").String() +) + +// Regexes. +var ( + pciBusIDRegex = regexp.MustCompile(`(?P[0-9a-fA-F]+):(?P[0-9a-fA-F]+):(?P[0-9a-fA-F]+)\.(?P[0-9a-fA-F]+)`) + mdevRegexes = map[string]*regexp.Regexp{ + "pciAddr": regexp.MustCompile(`GPU ([a-fA-F0-9.:]+)`), + "mdevUUID": regexp.MustCompile(`^\s+MDEV UUID\s+: ([a-zA-Z0-9\-]+)`), + "gpuInstID": regexp.MustCompile(`^\s+GPU Instance ID\s+: ([0-9]+|N/A)`), + } +) + +// BusID is a struct that contains PCI bus address of GPU device. +type BusID struct { + domain uint64 + bus uint64 + device uint64 + function uint64 +} + +// String implements Stringer interface of the BusID struct. +func (b BusID) String() string { + return fmt.Sprintf( + "%s:%s:%s.%s", + strconv.FormatUint(b.domain, 16), + strconv.FormatUint(b.bus, 16), + strconv.FormatUint(b.device, 16), + strconv.FormatUint(b.function, 16), + ) +} + +// Compare compares the provided bus ID with current bus ID and +// returns true if they match and false in all other cases. 
+func (b *BusID) Compare(bTest BusID) bool {
+	// Check equality component per component in ID
+	return b.domain == bTest.domain && b.bus == bTest.bus && b.device == bTest.device && b.function == bTest.function
+}
+
+// Memory contains the total memory reported for a GPU or MIG device.
+type Memory struct {
+	Total string `xml:"total"`
+}
+
+// DeviceAttrsShared contains the shared engine counts of a MIG device.
+type DeviceAttrsShared struct {
+	XMLName  xml.Name `xml:"shared"`
+	SMCount  uint64   `xml:"multiprocessor_count"`
+	CECount  uint64   `xml:"copy_engine_count"`
+	EncCount uint64   `xml:"encoder_count"`
+	DecCount uint64   `xml:"decoder_count"`
+}
+
+// DeviceAttrs contains the attributes of a MIG device.
+type DeviceAttrs struct {
+	XMLName xml.Name          `xml:"device_attributes"`
+	Shared  DeviceAttrsShared `xml:"shared"`
+}
+
+// MIGDevice contains the details of a single MIG device.
+type MIGDevice struct {
+	XMLName       xml.Name    `xml:"mig_device"`
+	Index         uint64      `xml:"index"`
+	GPUInstID     uint64      `xml:"gpu_instance_id"`
+	ComputeInstID uint64      `xml:"compute_instance_id"`
+	DeviceAttrs   DeviceAttrs `xml:"device_attributes"`
+	FBMemory      Memory      `xml:"fb_memory_usage"`
+	Bar1Memory    Memory      `xml:"bar1_memory_usage"`
+}
+
+// MIGDevices is the list of MIG devices of a GPU.
+type MIGDevices struct {
+	XMLName xml.Name    `xml:"mig_devices"`
+	Devices []MIGDevice `xml:"mig_device"`
+}
+
+// VirtMode contains the virtualization mode of a GPU.
+type VirtMode struct {
+	XMLName  xml.Name `xml:"gpu_virtualization_mode"`
+	Mode     string   `xml:"virtualization_mode"`
+	HostMode string   `xml:"host_vgpu_mode"`
+}
+
+// MIGMode contains the current MIG mode of a GPU.
+type MIGMode struct {
+	XMLName    xml.Name `xml:"mig_mode"`
+	CurrentMIG string   `xml:"current_mig"`
+}
+
+// GPU is a single GPU element in nvidia-smi XML output.
+type GPU struct {
+	XMLName      xml.Name   `xml:"gpu"`
+	ID           string     `xml:"id,attr"`
+	ProductName  string     `xml:"product_name"`
+	ProductBrand string     `xml:"product_brand"`
+	ProductArch  string     `xml:"product_architecture"`
+	MIGMode      MIGMode    `xml:"mig_mode"`
+	VirtMode     VirtMode   `xml:"gpu_virtualization_mode"`
+	MIGDevices   MIGDevices `xml:"mig_devices"`
+	UUID         string     `xml:"uuid"`
+	MinorNumber  string     `xml:"minor_number"`
+}
+
+// NVIDIASMILog is the root element of nvidia-smi XML output.
+type NVIDIASMILog struct {
+	XMLName xml.Name `xml:"nvidia_smi_log"`
+	GPUs    []GPU    `xml:"gpu"`
+}
+
+// MIGInstance contains the details of a MIG instance.
+type MIGInstance struct {
+	localIndex    uint64
+	globalIndex   string
+	computeInstID uint64
+	gpuInstID     uint64
+	smFraction    float64
+	mdevUUIDs     []string
+}
+
+// Device contains the details of GPU devices.
+type Device struct {
+	localIndex   string
+	globalIndex  string
+	name         string
+	uuid         string
+	busID        BusID
+	mdevUUIDs    []string
+	migInstances []MIGInstance
+	migEnabled   bool
+	vgpuEnabled  bool
+}
+
+// String implements Stringer interface of the Device struct.
+func (d Device) String() string {
+	return fmt.Sprintf(
+		"name: %s; local_index: %s; global_index: %s; uuid: %s; bus_id: %s; "+
+			"num_mdevs: %d; mig_enabled: %t; num_migs: %d; vgpu_enabled: %t",
+		d.name, d.localIndex, d.globalIndex, d.uuid, d.busID,
+		len(d.mdevUUIDs), d.migEnabled, len(d.migInstances), d.vgpuEnabled,
+	)
+}
+
+// CompareBusID compares the provided bus ID with device bus ID and
+// returns true if they match and false in all other cases.
+func (d *Device) CompareBusID(id string) bool {
+	// Parse bus id that needs to be compared
+	busID, err := parseBusID(id)
+	if err != nil {
+		return false
+	}
+
+	// Check equality component per component in ID
+	return d.busID.Compare(busID)
+}
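As a quick illustration of the bus-ID helpers above, a minimal sketch with hypothetical values (everything except `fmt` comes from this file):

```go
package collector

import "fmt"

// ExampleBusID is a hypothetical illustration, not part of the change set.
func ExampleBusID() {
	dev := Device{busID: BusID{domain: 0x0, bus: 0xad, device: 0x0, function: 0x0}}

	fmt.Println(dev.CompareBusID("00000000:AD:00.0")) // true: hex components match after parsing
	fmt.Println(dev.busID)                            // 0:ad:0.0 (hex, no zero padding)
}
```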
+
+// GetGPUDevices returns GPU devices of the given type.
+func GetGPUDevices(gpuType string, logger log.Logger) ([]Device, error) {
+	switch gpuType {
+	case "nvidia":
+		return GetNvidiaGPUDevices(logger)
+	case "amd":
+		return GetAMDGPUDevices(logger)
+	}
+
+	return nil, fmt.Errorf("unknown GPU type %s: only NVIDIA and AMD GPU devices are supported", gpuType)
+}
+
+// GetNvidiaGPUDevices returns all physical GPUs along with their MIG instances
+// and vGPU status by running `nvidia-smi --query --xml-format` and parsing the
+// XML output.
+//
+// We use nvidia-smi here instead of the NVML Go bindings to avoid build-time
+// dependencies on NVIDIA libraries and keep the exporter simple.
+func GetNvidiaGPUDevices(logger log.Logger) ([]Device, error) {
+	// Look up nvidia-smi command
+	nvidiaSmiCmd, err := lookupNvidiaSmiCmd()
+	if err != nil {
+		return nil, fmt.Errorf("failed to find nvidia-smi command: %w", err)
+	}
+
+	// Execute nvidia-smi command to get available GPUs
+	args := []string{"--query", "--xml-format"}
+
+	nvidiaSmiOutput, err := osexec.Execute(nvidiaSmiCmd, args, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	return parseNvidiaSmiOutput(nvidiaSmiOutput, logger)
+}
+
+// GetAMDGPUDevices returns all GPU devices using rocm-smi command
+// Example output:
+// bash-4.4$ rocm-smi --showproductname --showserial --showbus --csv
+// device,Serial Number,Card series,Card model,Card vendor,Card SKU
+// card0,20170000800c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317
+// card1,20170003580c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317
+// card2,20180003050c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317.
+func GetAMDGPUDevices(logger log.Logger) ([]Device, error) {
+	// Look up rocm-smi command
+	rocmSmiCmd, err := lookupRocmSmiCmd()
+	if err != nil {
+		return nil, fmt.Errorf("failed to find rocm-smi command: %w", err)
+	}
+
+	// Execute rocm-smi command to get available GPUs
+	args := []string{"--showproductname", "--showserial", "--showbus", "--csv"}
+
+	rocmSmiOutput, err := osexec.Execute(rocmSmiCmd, args, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	return parseAmdSmiOutput(string(rocmSmiOutput), logger), nil
+}
+
+// lookupNvidiaSmiCmd checks if the nvidia-smi path provided on the CLI exists
+// and falls back to the `nvidia-smi` command on the host.
+func lookupNvidiaSmiCmd() (string, error) {
+	if *nvidiaSmiPath != "" {
+		if _, err := os.Stat(*nvidiaSmiPath); err != nil {
+			return "", err
+		}
+
+		return *nvidiaSmiPath, nil
+	}
+
+	nvidiaSmiCmd := "nvidia-smi"
+	if _, err := exec.LookPath(nvidiaSmiCmd); err != nil {
+		return "", err
+	}
+
+	return nvidiaSmiCmd, nil
+}
+
+// lookupRocmSmiCmd checks if the rocm-smi path provided on the CLI exists and
+// falls back to the `rocm-smi` command on the host.
+func lookupRocmSmiCmd() (string, error) {
+	if *rocmSmiPath != "" {
+		if _, err := os.Stat(*rocmSmiPath); err != nil {
+			return "", err
+		}
+
+		return *rocmSmiPath, nil
+	}
+
+	rocmSmiCmd := "rocm-smi"
+	if _, err := exec.LookPath(rocmSmiCmd); err != nil {
+		return "", err
+	}
+
+	return rocmSmiCmd, nil
+}
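Before the parsing code itself, a trimmed sketch of the nvidia-smi XML these structs map. The element names come from the `xml` struct tags above; the values are illustrative:

```xml
<nvidia_smi_log>
  <gpu id="00000000:10:00.0">
    <product_name>NVIDIA A100-PCIE-40GB</product_name>
    <uuid>GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e</uuid>
    <mig_mode><current_mig>Enabled</current_mig></mig_mode>
    <gpu_virtualization_mode>
      <virtualization_mode>VGPU</virtualization_mode>
    </gpu_virtualization_mode>
    <mig_devices>
      <mig_device>
        <index>0</index>
        <gpu_instance_id>1</gpu_instance_id>
        <compute_instance_id>0</compute_instance_id>
        <device_attributes>
          <shared><multiprocessor_count>42</multiprocessor_count></shared>
        </device_attributes>
      </mig_device>
    </mig_devices>
  </gpu>
</nvidia_smi_log>
```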
+
+// parseNvidiaSmiOutput parses nvidia-smi XML output and returns GPU devices.
+func parseNvidiaSmiOutput(cmdOutput []byte, logger log.Logger) ([]Device, error) {
+	// Get all devices
+	var gpuDevices []Device
+
+	// Read XML byte array into gpu
+	var nvidiaSMILog NVIDIASMILog
+	if err := xml.Unmarshal(cmdOutput, &nvidiaSMILog); err != nil {
+		return nil, err
+	}
+
+	// NOTE: Devices should be ordered by PCI bus address. nvidia-smi seems to
+	// return them in the correct order most of the time.
+	var globalIndex uint64
+
+	for igpu, gpu := range nvidiaSMILog.GPUs {
+		var err error
+
+		dev := Device{
+			localIndex: strconv.FormatInt(int64(igpu), 10),
+			uuid:       gpu.UUID,
+			name:       fmt.Sprintf("%s %s %s", gpu.ProductName, gpu.ProductBrand, gpu.ProductArch),
+		}
+
+		// Parse bus ID
+		dev.busID, err = parseBusID(gpu.ID)
+		if err != nil {
+			level.Error(logger).Log("msg", "Failed to parse GPU bus ID", "bus_id", gpu.ID, "err", err)
+		}
+
+		// Check MIG status
+		if gpu.MIGMode.CurrentMIG == "Enabled" {
+			dev.migEnabled = true
+		} else {
+			dev.globalIndex = strconv.FormatUint(globalIndex, 10)
+			globalIndex++
+		}
+
+		// If MIG is enabled, get all MIG devices
+		var totalSMs float64
+
+		var migDevs []MIGInstance
+
+		for _, mig := range gpu.MIGDevices.Devices {
+			migDev := MIGInstance{
+				localIndex:    mig.Index,
+				globalIndex:   strconv.FormatUint(globalIndex, 10),
+				computeInstID: mig.ComputeInstID,
+				gpuInstID:     mig.GPUInstID,
+			}
+
+			totalSMs += float64(mig.DeviceAttrs.Shared.SMCount)
+
+			migDevs = append(migDevs, migDev)
+			globalIndex++
+		}
+
+		// Now that we have the total SM count, compute each instance's fraction.
+		// We will use it for splitting total power between instances.
+		for imig, mig := range gpu.MIGDevices.Devices {
+			migDevs[imig].smFraction = float64(mig.DeviceAttrs.Shared.SMCount) / totalSMs
+		}
+
+		dev.migInstances = migDevs
+
+		// Check vGPU status
+		// Mode can be VGPU or VSGA. VSGA is vSphere-specific, so we don't
+		// worry about it and only check if the mode is VGPU
+		if gpu.VirtMode.Mode == "VGPU" {
+			dev.vgpuEnabled = true
+		}
+
+		gpuDevices = append(gpuDevices, dev)
+		level.Debug(logger).Log("msg", "Found NVIDIA GPU", "gpu", dev)
+	}
+
+	return gpuDevices, nil
+}
+
+// parseAmdSmiOutput parses rocm-smi output and returns AMD devices.
+func parseAmdSmiOutput(cmdOutput string, logger log.Logger) []Device {
+	var gpuDevices []Device
+
+	for _, line := range strings.Split(strings.TrimSpace(cmdOutput), "\n") {
+		// Header line, empty line and newlines are ignored
+		if line == "" || line == "\n" || strings.HasPrefix(line, "device") {
+			continue
+		}
+
+		devDetails := strings.Split(line, ",")
+		if len(devDetails) < 7 {
+			continue
+		}
+
+		// Get device index, UUID, bus ID and name
+		devIndx := strings.TrimPrefix(devDetails[0], "card")
+		devUUID := strings.TrimSpace(devDetails[1])
+		devBusID := strings.TrimSpace(devDetails[2])
+		devName := strings.TrimSpace(devDetails[3])
+
+		// Parse bus ID
+		busID, err := parseBusID(devBusID)
+		if err != nil {
+			level.Error(logger).Log("msg", "Failed to parse GPU bus ID", "bus_id", devBusID, "err", err)
+		}
+
+		dev := Device{localIndex: devIndx, globalIndex: devIndx, name: devName, uuid: devUUID, busID: busID, migEnabled: false}
+		level.Debug(logger).Log("msg", "Found AMD GPU", "gpu", dev)
+
+		gpuDevices = append(gpuDevices, dev)
+	}
+
+	return gpuDevices
+}
+
+// reindexGPUs reindexes GPU globalIndex based on the orderMap string, a
+// comma-separated list of <globalIndex>:<localIndex>[.<gpuInstID>] pairs.
+func reindexGPUs(orderMap string, devs []Device) []Device { + for _, gpuMap := range strings.Split(orderMap, ",") { + orderMap := strings.Split(gpuMap, ":") + if len(orderMap) < 2 { + continue + } + + // Check if MIG instance ID is present + devIndx := strings.Split(orderMap[1], ".") + for idev := range devs { + if devs[idev].localIndex == strings.TrimSpace(devIndx[0]) { + if len(devIndx) == 2 { + for imig := range devs[idev].migInstances { + if strconv.FormatUint(devs[idev].migInstances[imig].gpuInstID, 10) == strings.TrimSpace(devIndx[1]) { + devs[idev].migInstances[imig].globalIndex = strings.TrimSpace(orderMap[0]) + + break + } + } + } else { + devs[idev].globalIndex = strings.TrimSpace(orderMap[0]) + } + } + } + } + + return devs +} + +// updateGPUMdevs updates GPU devices slice with mdev UUIDs. +func updateGPUMdevs(devs []Device) ([]Device, error) { + // Look up nvidia-smi command + nvidiaSmiCmd, err := lookupNvidiaSmiCmd() + if err != nil { + return nil, fmt.Errorf("failed to find nvidia-smi command: %w", err) + } + + // Execute command + stdOut, err := osexec.Execute(nvidiaSmiCmd, []string{"vgpu", "--query"}, nil) + if err != nil { + return nil, fmt.Errorf("failed to execute nvidia-smi vgpu command: %w", err) + } + + vGPUQueryOut := string(stdOut) + + // Get all GPU addresses + allGPUs := mdevRegexes["pciAddr"].FindAllString(vGPUQueryOut, -1) + + // Split all lines + lines := strings.Split(vGPUQueryOut, "\n") + + // Get range of lines for each GPU + var gpuIndx []int + + for iline, line := range lines { + if slices.Contains(allGPUs, line) { + gpuIndx = append(gpuIndx, iline) + } + } + + gpuIndx = append(gpuIndx, len(lines)) + + // For each GPU output extract GPU addr, mdev UUID and GPU instance ID + gpuMdevs := make(map[string][][]string) + + for indx := range len(gpuIndx) - 1 { + var addr string + + var uuids, instIDs []string + + for i := gpuIndx[indx]; i < gpuIndx[indx+1]; i++ { + if matches := mdevRegexes["pciAddr"].FindStringSubmatch(lines[i]); len(matches) > 1 { + addr = strings.TrimSpace(matches[1]) + } + + if matches := mdevRegexes["mdevUUID"].FindStringSubmatch(lines[i]); len(matches) > 1 { + uuids = append(uuids, strings.TrimSpace(matches[1])) + } + + if matches := mdevRegexes["gpuInstID"].FindStringSubmatch(lines[i]); len(matches) > 1 { + instIDs = append(instIDs, strings.TrimSpace(matches[1])) + } + } + + for imdev := range uuids { + gpuMdevs[addr] = append(gpuMdevs[addr], []string{uuids[imdev], instIDs[imdev]}) + } + } + + // Loop over each mdev and add them to devs + for addr, mdevs := range gpuMdevs { + // Loop over all devs to find GPU that has same busID + for idev := range devs { + if devs[idev].CompareBusID(addr) { + // Remove existing mdevs + if devs[idev].migEnabled { + for imig := range devs[idev].migInstances { + devs[idev].migInstances[imig].mdevUUIDs = make([]string, 0) + } + } else { + devs[idev].mdevUUIDs = make([]string, 0) + } + + for _, mdev := range mdevs { + // If MIG is enabled, loop over all MIG instances to compare instance ID + if devs[idev].migEnabled { + for imig := range devs[idev].migInstances { + if strconv.FormatUint(devs[idev].migInstances[imig].gpuInstID, 10) == mdev[1] { + // Ensure not to duplicate mdevs + if !slices.Contains(devs[idev].migInstances[imig].mdevUUIDs, mdev[0]) { + devs[idev].migInstances[imig].mdevUUIDs = append(devs[idev].migInstances[imig].mdevUUIDs, mdev[0]) + } + } + } + } else { + if !slices.Contains(devs[idev].mdevUUIDs, mdev[0]) { + devs[idev].mdevUUIDs = append(devs[idev].mdevUUIDs, mdev[0]) + } + } + } + } + } + 
}
+
+	return devs, nil
+}
+
+// parseBusID parses a PCIe bus ID string into a BusID struct.
+func parseBusID(id string) (BusID, error) {
+	// Bus ID is in the form of <domain>:<bus>:<device>.<function>
+	matches := pciBusIDRegex.FindStringSubmatch(id)
+
+	var values []uint64
+
+	for i, match := range matches {
+		if i != 0 {
+			value, err := strconv.ParseUint(match, 16, 16)
+			if err != nil {
+				return BusID{}, err
+			}
+
+			values = append(values, value)
+		}
+	}
+
+	if len(values) == 4 {
+		return BusID{domain: values[0], bus: values[1], device: values[2], function: values[3]}, nil
+	}
+
+	return BusID{}, fmt.Errorf("error parsing PCIe bus ID: %s", id)
+}
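To make the reindexing convention above concrete, a minimal sketch of how reindexGPUs remaps indices. The devices are hypothetical; the orderMap entries follow the `<globalIndex>:<localIndex>[.<gpuInstID>]` format parsed above:

```go
package collector

// Hypothetical node: GPU 0 is MIG-enabled with GPU instance IDs 3 and 5,
// GPU 1 is a plain device. The resource manager enumerates the full GPU
// first, so the exporter's global indices must be remapped to match.
func exampleReindex() []Device {
	devs := []Device{
		{localIndex: "0", migEnabled: true, migInstances: []MIGInstance{
			{gpuInstID: 3}, {gpuInstID: 5},
		}},
		{localIndex: "1", globalIndex: "1"},
	}

	// After this call devs[1].globalIndex == "0", and the MIG instances
	// with GPU instance IDs 3 and 5 get global indices "1" and "2".
	return reindexGPUs("0:1,1:0.3,2:0.5", devs)
}
```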
"NVIDIA A100-PCIE-40GB NVIDIA Ampere", + uuid: "GPU-6cc98505-fdde-461e-a93c-6935fba45a27", + busID: BusID{domain: 0x0, bus: 0x89, device: 0x0, function: 0x0}, + migEnabled: false, + vgpuEnabled: false, + }, + } +} + +func getExpectedAmdDevs() []Device { + return []Device{ + { + localIndex: "0", + globalIndex: "0", + name: "deon Instinct MI50 32GB", + uuid: "20170000800c", + busID: BusID{domain: 0x0, bus: 0xc5, device: 0x0, function: 0x0}, + migEnabled: false, + vgpuEnabled: false, + }, + { + localIndex: "1", + globalIndex: "1", + name: "deon Instinct MI50 32GB", + uuid: "20170003580c", + busID: BusID{domain: 0x0, bus: 0xc8, device: 0x0, function: 0x0}, + migEnabled: false, + vgpuEnabled: false, + }, + { + localIndex: "2", + globalIndex: "2", + name: "deon Instinct MI50 32GB", + uuid: "20180003050c", + busID: BusID{domain: 0x0, bus: 0x8a, device: 0x0, function: 0x0}, + migEnabled: false, + vgpuEnabled: false, + }, + { + localIndex: "3", + globalIndex: "3", + name: "deon Instinct MI50 32GB", + uuid: "20170005280c", + busID: BusID{domain: 0x0, bus: 0x8d, device: 0x0, function: 0x0}, + migEnabled: false, + vgpuEnabled: false, + }, + } +} + +func TestParseNvidiaSmiOutput(t *testing.T) { + _, err := CEEMSExporterApp.Parse( + []string{ + "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", + }, + ) + require.NoError(t, err) + + gpuDevices, err := GetNvidiaGPUDevices(log.NewNopLogger()) + require.NoError(t, err) + assert.Equal(t, getExpectedNvidiaDevs(), gpuDevices) +} + +func TestNvidiaMIGAtLowerAddr(t *testing.T) { + nvidiaSmiLog := ` + + + Fri Oct 11 18:24:09 2024 + 535.129.03 + 12.2 + 2 + + + Enabled + Enabled + + + + 0 + 1 + 0 + + + 1 + 5 + 0 + + + 2 + 5 + 0 + + + + + + N/A + N/A + + + None + + +` + tempDir := t.TempDir() + nvidiaSMIPath := filepath.Join(tempDir, "nvidia-smi") + content := fmt.Sprintf(`#!/bin/bash +echo """%s""" +`, nvidiaSmiLog) + os.WriteFile(nvidiaSMIPath, []byte(content), 0o700) // #nosec + + _, err := CEEMSExporterApp.Parse( + []string{ + "--collector.gpu.nvidia-smi-path", nvidiaSMIPath, + }, + ) + require.NoError(t, err) + + gpuDevices, err := GetNvidiaGPUDevices(log.NewNopLogger()) + require.NoError(t, err) + + // Check if globalIndex for GPU 0 is empty and GPU 1 is 3 + assert.Empty(t, gpuDevices[0].globalIndex) + assert.Equal(t, "3", gpuDevices[1].globalIndex) +} + +func TestNvidiaMIGAtHigherAddr(t *testing.T) { + nvidiaSmiLog := ` + + + Fri Oct 11 18:24:09 2024 + 535.129.03 + 12.2 + 2 + + + N/A + N/A + + + None + + + + + Enabled + Enabled + + + + 0 + 1 + 0 + + + 1 + 5 + 0 + + + 2 + 5 + 0 + + + +` + tempDir := t.TempDir() + nvidiaSMIPath := filepath.Join(tempDir, "nvidia-smi") + content := fmt.Sprintf(`#!/bin/bash +echo """%s""" +`, nvidiaSmiLog) + os.WriteFile(nvidiaSMIPath, []byte(content), 0o700) // #nosec + + _, err := CEEMSExporterApp.Parse( + []string{ + "--collector.gpu.nvidia-smi-path", nvidiaSMIPath, + }, + ) + require.NoError(t, err) + + gpuDevices, err := GetNvidiaGPUDevices(log.NewNopLogger()) + require.NoError(t, err) + + // Check if globalIndex for GPU 1 is empty and GPU 0 is 0 + assert.Empty(t, gpuDevices[1].globalIndex) + assert.Equal(t, "0", gpuDevices[0].globalIndex) +} + +func TestParseAmdSmiOutput(t *testing.T) { + _, err := CEEMSExporterApp.Parse( + []string{ + "--collector.gpu.rocm-smi-path", "testdata/rocm-smi", + }, + ) + require.NoError(t, err) + gpuDevices, err := GetAMDGPUDevices(log.NewNopLogger()) + require.NoError(t, err) + assert.Equal(t, getExpectedAmdDevs(), gpuDevices) +} + +func TestReindexGPUs(t *testing.T) { + testCases := 
[]struct { + name string + devs []Device + expectedDevs []Device + orderMap string + }{ + { + devs: []Device{ + { + globalIndex: "", + localIndex: "0", + migInstances: []MIGInstance{ + { + globalIndex: "0", + gpuInstID: 3, + }, + { + globalIndex: "1", + gpuInstID: 5, + }, + { + globalIndex: "2", + gpuInstID: 9, + }, + }, + migEnabled: true, + }, + { + globalIndex: "1", + localIndex: "1", + }, + }, + expectedDevs: []Device{ + { + globalIndex: "", + localIndex: "0", + migInstances: []MIGInstance{ + { + globalIndex: "1", + gpuInstID: 3, + }, + { + globalIndex: "2", + gpuInstID: 5, + }, + { + globalIndex: "3", + gpuInstID: 9, + }, + }, + migEnabled: true, + }, + { + globalIndex: "0", + localIndex: "1", + }, + }, + orderMap: "0:1,1:0.3,2:0.5,3:0.9", + }, + { + devs: []Device{ + { + globalIndex: "0", + localIndex: "0", + }, + { + globalIndex: "", + localIndex: "1", + migInstances: []MIGInstance{ + { + globalIndex: "1", + gpuInstID: 3, + }, + { + globalIndex: "2", + gpuInstID: 5, + }, + { + globalIndex: "3", + gpuInstID: 9, + }, + }, + migEnabled: true, + }, + }, + expectedDevs: []Device{ + { + globalIndex: "", + localIndex: "1", + migInstances: []MIGInstance{ + { + globalIndex: "0", + gpuInstID: 3, + }, + { + globalIndex: "1", + gpuInstID: 5, + }, + { + globalIndex: "2", + gpuInstID: 9, + }, + }, + migEnabled: true, + }, + { + globalIndex: "3", + localIndex: "0", + }, + }, + orderMap: "0:1.3,1:1.5,2:1.9,3:0", + }, + { + devs: []Device{ + { + globalIndex: "0", + localIndex: "0", + }, + { + globalIndex: "", + localIndex: "1", + migInstances: []MIGInstance{ + { + globalIndex: "1", + gpuInstID: 3, + }, + { + globalIndex: "2", + gpuInstID: 5, + }, + { + globalIndex: "3", + gpuInstID: 9, + }, + }, + migEnabled: true, + }, + { + globalIndex: "4", + localIndex: "2", + }, + }, + expectedDevs: []Device{ + { + globalIndex: "0", + localIndex: "0", + }, + { + globalIndex: "1", + localIndex: "2", + }, + { + globalIndex: "", + localIndex: "1", + migInstances: []MIGInstance{ + { + globalIndex: "2", + gpuInstID: 3, + }, + { + globalIndex: "3", + gpuInstID: 5, + }, + { + globalIndex: "4", + gpuInstID: 9, + }, + }, + migEnabled: true, + }, + }, + orderMap: "0:0,1:2,2:1.3,3:1.5,4:1.9", + }, + { + devs: []Device{ + { + globalIndex: "0", + localIndex: "0", + }, + { + globalIndex: "", + localIndex: "1", + migInstances: []MIGInstance{ + { + globalIndex: "1", + gpuInstID: 3, + }, + { + globalIndex: "2", + gpuInstID: 5, + }, + { + globalIndex: "3", + gpuInstID: 9, + }, + }, + migEnabled: true, + }, + { + globalIndex: "4", + localIndex: "2", + }, + }, + expectedDevs: []Device{ + { + globalIndex: "0", + localIndex: "0", + }, + { + globalIndex: "1", + localIndex: "2", + }, + { + globalIndex: "", + localIndex: "1", + migInstances: []MIGInstance{ + { + globalIndex: "2", + gpuInstID: 3, + }, + { + globalIndex: "3", + gpuInstID: 5, + }, + { + globalIndex: "4", + gpuInstID: 9, + }, + }, + migEnabled: true, + }, + }, + orderMap: "0:0,1:2,2:1.3,3:1.5,4:1.9,5:3,6:3.3", + }, + } + + for itc, tc := range testCases { + newDevs := reindexGPUs(tc.orderMap, tc.devs) + assert.ElementsMatch(t, tc.expectedDevs, newDevs, "Case %d", itc) + } +} + +func TestUpdateMdevs(t *testing.T) { + _, err := CEEMSExporterApp.Parse( + []string{ + "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", + }, + ) + require.NoError(t, err) + + gpuDevices, err := GetNvidiaGPUDevices(log.NewNopLogger()) + require.NoError(t, err) + + expectedDevs := []Device{ + { + localIndex: "0", globalIndex: "0", name: "NVIDIA A100-PCIE-40GB NVIDIA Ampere", uuid: 
"GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", + busID: BusID{domain: 0x0, bus: 0x10, device: 0x0, function: 0x0}, + mdevUUIDs: []string{"c73f1fa6-489e-4834-9476-d70dabd98c40", "f9702ffa-fa28-414e-a52f-e7831fd5ce41"}, + migEnabled: false, vgpuEnabled: true, + }, + { + localIndex: "1", globalIndex: "1", name: "NVIDIA A100-PCIE-40GB NVIDIA Ampere", uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", + busID: BusID{domain: 0x0, bus: 0x15, device: 0x0, function: 0x0}, + migEnabled: false, vgpuEnabled: false, + }, + { + localIndex: "2", globalIndex: "", name: "NVIDIA A100-PCIE-40GB NVIDIA Ampere", uuid: "GPU-956348bc-d43d-23ed-53d4-857749fa2b67", + busID: BusID{domain: 0x0, bus: 0x21, device: 0x0, function: 0x0}, + migInstances: []MIGInstance{ + {localIndex: 0x0, globalIndex: "2", computeInstID: 0x0, gpuInstID: 0x1, smFraction: 0.6, mdevUUIDs: []string{"f0f4b97c-6580-48a6-ae1b-a807d6dfe08f"}}, + {localIndex: 0x1, globalIndex: "3", computeInstID: 0x0, gpuInstID: 0x5, smFraction: 0.2, mdevUUIDs: []string{"3b356d38-854e-48be-b376-00c72c7d119c", "5bb3bad7-ce3b-4aa5-84d7-b5b33cf9d45e"}}, + {localIndex: 0x2, globalIndex: "4", computeInstID: 0x0, gpuInstID: 0xd, smFraction: 0.2, mdevUUIDs: []string{}}, + }, + migEnabled: true, vgpuEnabled: true, + }, + { + localIndex: "3", globalIndex: "", name: "NVIDIA A100-PCIE-40GB NVIDIA Ampere", uuid: "GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7", + busID: BusID{domain: 0x0, bus: 0x81, device: 0x0, function: 0x0}, + migInstances: []MIGInstance{ + {localIndex: 0x0, globalIndex: "5", computeInstID: 0x0, gpuInstID: 0x1, smFraction: 0.5714285714285714, mdevUUIDs: []string{"4f84d324-5897-48f3-a4ef-94c9ddf23d78"}}, + {localIndex: 0x1, globalIndex: "6", computeInstID: 0x0, gpuInstID: 0x5, smFraction: 0.2857142857142857, mdevUUIDs: []string{"3058eb95-0899-4c3d-90e9-e20b6c14789f"}}, + {localIndex: 0x2, globalIndex: "7", computeInstID: 0x0, gpuInstID: 0x6, smFraction: 0.14285714285714285, mdevUUIDs: []string{"9f0d5993-9778-40c7-a721-3fec93d6b3a9"}}, + }, + migEnabled: true, vgpuEnabled: true, + }, + { + localIndex: "4", globalIndex: "8", name: "NVIDIA A100-PCIE-40GB NVIDIA Ampere", uuid: "GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3", + busID: BusID{domain: 0x0, bus: 0x83, device: 0x0, function: 0x0}, + migEnabled: false, vgpuEnabled: false, + }, + { + localIndex: "5", globalIndex: "9", name: "NVIDIA A100-PCIE-40GB NVIDIA Ampere", uuid: "GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3", + busID: BusID{domain: 0x0, bus: 0x85, device: 0x0, function: 0x0}, + mdevUUIDs: []string{"64c3c4ae-44e1-45b8-8d46-5f76a1fa9824"}, + migEnabled: false, vgpuEnabled: true, + }, + { + localIndex: "6", globalIndex: "10", name: "NVIDIA A100-PCIE-40GB NVIDIA Ampere", uuid: "GPU-1d4d0f3e-b51a-4040-96e3-bf380f7c5728", + busID: BusID{domain: 0x0, bus: 0x87, device: 0x0, function: 0x0}, + migEnabled: false, vgpuEnabled: false, + }, + { + localIndex: "7", globalIndex: "11", name: "NVIDIA A100-PCIE-40GB NVIDIA Ampere", uuid: "GPU-6cc98505-fdde-461e-a93c-6935fba45a27", + busID: BusID{domain: 0x0, bus: 0x89, device: 0x0, function: 0x0}, + migEnabled: false, vgpuEnabled: false, + }, + } + + // Now updates gpuDevices with mdevs + updatedGPUDevs, err := updateGPUMdevs(gpuDevices) + require.NoError(t, err) + assert.EqualValues(t, expectedDevs, updatedGPUDevs) +} + +func TestParseBusIDPass(t *testing.T) { + id := "00000000:AD:00.0" + busID, err := parseBusID(id) + require.NoError(t, err) + + expectedID := BusID{domain: 0x0, bus: 0xad, device: 0x0, function: 0x0} + + assert.Equal(t, expectedID, busID) +} + +func 
TestParseBusIDFail(t *testing.T) { + // Missing component + id := "00000000:AD:00" + _, err := parseBusID(id) + require.Error(t, err) + + // Malformed ID + id = "00000000:AD:00:4" + _, err = parseBusID(id) + require.Error(t, err) + + // Not Hex + id = "ggggggg:AD:00:0" + _, err = parseBusID(id) + require.Error(t, err) +} + +func TestCompareBusIDs(t *testing.T) { + // Sample Device + d := Device{busID: BusID{domain: 0x0, bus: 0xad, device: 0x0, function: 0x0}} + + // Test ID - pass + id := "00000000:AD:00.0" + assert.True(t, d.CompareBusID(id)) + + // Test ID - fail + id = "00000000:AD:0A.0" + assert.False(t, d.CompareBusID(id)) + + // Test ID - error fail + id = "00000000:AD:00" + assert.False(t, d.CompareBusID(id)) +} diff --git a/pkg/collector/helper.go b/pkg/collector/helper.go index 4923b7fe..41af0275 100644 --- a/pkg/collector/helper.go +++ b/pkg/collector/helper.go @@ -4,88 +4,18 @@ import ( "errors" "fmt" "os" - "os/exec" "path/filepath" "regexp" "strconv" "strings" "syscall" - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/mahendrapaipuri/ceems/internal/osexec" "github.com/prometheus/procfs" ) -type BusID struct { - domain uint64 - bus uint64 - slot uint64 - function uint64 -} - -// Compare compares the provided bus ID with current bus ID and -// returns true if they match and false in all other cases. -func (b *BusID) Compare(bTest BusID) bool { - // Check equality component per component in ID - if b.domain == bTest.domain && b.bus == bTest.bus && b.slot == bTest.slot && b.function == bTest.function { - return true - } else { - return false - } -} - -// Device contains the details of GPU devices. -type Device struct { - index string - name string - uuid string - busID BusID - isMig bool - isvGPU bool -} - -// String implements Stringer interface of the Device struct. -func (d Device) String() string { - return fmt.Sprintf( - "name: %s; index: %s; uuid: %s; bus_id: %v; is_mig_instance: %t; is_vgpu_instance: %t", - d.name, d.index, d.uuid, d.busID, d.isMig, d.isvGPU, - ) -} - -// CompareBusID compares the provided bus ID with device bus ID and -// returns true if they match and false in all other cases. -func (d *Device) CompareBusID(id string) bool { - // Parse bus id that needs to be compared - busID, err := parseBusID(id) - if err != nil { - return false - } - - // Check equality component per component in ID - return d.busID.Compare(busID) -} - var ( metricNameRegex = regexp.MustCompile(`_*[^0-9A-Za-z_]+_*`) reParens = regexp.MustCompile(`\((.*)\)`) - pciBusIDRegex = regexp.MustCompile(`(?P[0-9a-fA-F]+):(?P[0-9a-fA-F]+):(?P[0-9a-fA-F]+)\.(?P[0-9a-fA-F]+)`) -) - -// Used for e2e tests. -var ( - gpuType = CEEMSExporterApp.Flag( - "collector.gpu.type", - "GPU device type. Currently only nvidia and amd devices are supported.", - ).Hidden().Enum("nvidia", "amd") - nvidiaSmiPath = CEEMSExporterApp.Flag( - "collector.gpu.nvidia-smi-path", - "Absolute path to nvidia-smi binary. Use only for testing.", - ).Hidden().Default("").String() - rocmSmiPath = CEEMSExporterApp.Flag( - "collector.gpu.rocm-smi-path", - "Absolute path to rocm-smi binary. Use only for testing.", - ).Hidden().Default("").String() ) // SanitizeMetricName sanitize the given metric name by replacing invalid characters by underscores. @@ -103,176 +33,6 @@ func SanitizeMetricName(metricName string) string { return metricNameRegex.ReplaceAllString(metricName, "_") } -// GetGPUDevices returns GPU devices. 
-func GetGPUDevices(gpuType string, logger log.Logger) (map[int]Device, error) { - if gpuType == "nvidia" { - return GetNvidiaGPUDevices(*nvidiaSmiPath, logger) - } else if gpuType == "amd" { - return GetAMDGPUDevices(*rocmSmiPath, logger) - } - - return nil, fmt.Errorf("unknown GPU Type %s. Only nVIDIA and AMD GPU devices are supported", gpuType) -} - -// Parse nvidia-smi output and return GPU Devices map. -func parseNvidiaSmiOutput(cmdOutput string, logger log.Logger) map[int]Device { - // Get all devices - gpuDevices := map[int]Device{} - devIndxInt := 0 - - for _, line := range strings.Split(strings.TrimSpace(cmdOutput), "\n") { - // Header line, empty line and newlines are ignored - if line == "" || line == "\n" || strings.HasPrefix(line, "index") { - continue - } - - devDetails := strings.Split(line, ",") - if len(devDetails) < 4 { - continue - } - - // Get device index, name and UUID - devIndx := strings.TrimSpace(devDetails[0]) - devName := strings.TrimSpace(devDetails[1]) - devUUID := strings.TrimSpace(devDetails[2]) - devBusID := strings.TrimSpace(devDetails[3]) - - // Parse bus ID - busID, err := parseBusID(devBusID) - if err != nil { - level.Error(logger).Log("msg", "Failed to parse GPU bus ID", "bus_id", devBusID, "err", err) - } - - // Check if device is in MiG mode - isMig := false - if strings.HasPrefix(devUUID, "MIG") { - isMig = true - } - - gpuDevices[devIndxInt] = Device{index: devIndx, name: devName, uuid: devUUID, busID: busID, isMig: isMig} - level.Debug(logger).Log("msg", "Found nVIDIA GPU", "gpu", gpuDevices[devIndxInt]) - - devIndxInt++ - } - - return gpuDevices -} - -// GetNvidiaGPUDevices returns all physical or MIG devices using nvidia-smi command -// Example output: -// bash-4.4$ nvidia-smi --query-gpu=name,uuid --format=csv -// name, uuid -// Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e -// Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 -// -// Here we are using nvidia-smi to avoid having build issues if we use -// nvml go bindings. This way we dont have deps on nvidia stuff and keep -// exporter simple. -// -// NOTE: This command does not return MIG devices. 
-func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[int]Device, error) { - // Check if nvidia-smi binary exists - var nvidiaSmiCmd string - - if nvidiaSmiPath != "" { - if _, err := os.Stat(nvidiaSmiPath); err != nil { - return nil, err - } - - nvidiaSmiCmd = nvidiaSmiPath - } else { - nvidiaSmiCmd = "nvidia-smi" - if _, err := exec.LookPath(nvidiaSmiCmd); err != nil { - return nil, err - } - } - - // Execute nvidia-smi command to get available GPUs - args := []string{"--query-gpu=index,name,uuid,gpu_bus_id", "--format=csv"} - - nvidiaSmiOutput, err := osexec.Execute(nvidiaSmiCmd, args, nil) - if err != nil { - return nil, err - } - - return parseNvidiaSmiOutput(string(nvidiaSmiOutput), logger), nil -} - -func parseAmdSmioutput(cmdOutput string, logger log.Logger) map[int]Device { - gpuDevices := map[int]Device{} - devIndxInt := 0 - - for _, line := range strings.Split(strings.TrimSpace(cmdOutput), "\n") { - // Header line, empty line and newlines are ignored - if line == "" || line == "\n" || strings.HasPrefix(line, "device") { - continue - } - - devDetails := strings.Split(line, ",") - if len(devDetails) < 7 { - continue - } - - // Get device index, name and UUID - devIndx := strings.TrimPrefix(devDetails[0], "card") - devUUID := strings.TrimSpace(devDetails[1]) - devBusID := strings.TrimSpace(devDetails[2]) - devName := strings.TrimSpace(devDetails[3]) - - // Parse bus ID - busID, err := parseBusID(devBusID) - if err != nil { - level.Error(logger).Log("msg", "Failed to parse GPU bus ID", "bus_id", devBusID, "err", err) - } - - // Set isMig to false as it does not apply for AMD GPUs - isMig := false - - gpuDevices[devIndxInt] = Device{index: devIndx, name: devName, uuid: devUUID, busID: busID, isMig: isMig} - level.Debug(logger).Log("msg", "Found AMD GPU", "gpu", gpuDevices[devIndxInt]) - - devIndxInt++ - } - - return gpuDevices -} - -// GetAMDGPUDevices returns all GPU devices using rocm-smi command -// Example output: -// bash-4.4$ rocm-smi --showproductname --showserial --showbus --csv -// device,Serial Number,Card series,Card model,Card vendor,Card SKU -// card0,20170000800c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 -// card1,20170003580c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 -// card2,20180003050c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317. -func GetAMDGPUDevices(rocmSmiPath string, logger log.Logger) (map[int]Device, error) { - // Check if rocm-smi binary exists - var rocmSmiCmd string - - if rocmSmiPath != "" { - if _, err := os.Stat(rocmSmiPath); err != nil { - return nil, err - } - - rocmSmiCmd = rocmSmiPath - } else { - rocmSmiCmd = "rocm-smi" - - if _, err := exec.LookPath(rocmSmiCmd); err != nil { - return nil, err - } - } - - // Execute nvidia-smi command to get available GPUs - args := []string{"--showproductname", "--showserial", "--showbus", "--csv"} - - rocmSmiOutput, err := osexec.Execute(rocmSmiCmd, args, nil) - if err != nil { - return nil, err - } - - return parseAmdSmioutput(string(rocmSmiOutput), logger), nil -} - // cgroupProcs returns a map of active cgroups and processes contained in each cgroup. 
 func cgroupProcs(fs procfs.FS, idRegex *regexp.Regexp, targetEnvVars []string, procFilter func(string) bool) (map[string][]procfs.Proc, error) {
 	// Get all active procs
@@ -391,7 +151,7 @@ func lookPath(f string) (string, error) {
 		}
 	}
 
-	return "", errors.New("path does not exist")
+	return "", errors.New("file does not exist")
 }
 
 // inode returns the inode of a given path.
@@ -409,31 +169,6 @@ func inode(path string) (uint64, error) {
 	return stat.Ino, nil
 }
 
-// parseBusID parses PCIe bus ID string to BusID struct.
-func parseBusID(id string) (BusID, error) {
-	// Bus ID is in form of <domain>:<bus>:<slot>.<function>
-	matches := pciBusIDRegex.FindStringSubmatch(id)
-
-	var values []uint64
-
-	for i, match := range matches {
-		if i != 0 {
-			value, err := strconv.ParseUint(match, 16, 16)
-			if err != nil {
-				return BusID{}, err
-			}
-
-			values = append(values, value)
-		}
-	}
-
-	if len(values) == 4 {
-		return BusID{domain: values[0], bus: values[1], slot: values[2], function: values[3]}, nil
-	}
-
-	return BusID{}, fmt.Errorf("error parsing PCIe bus ID: %s", id)
-}
-
 // unescapeString sanitizes the string by unescaping UTF-8 characters.
 func unescapeString(s string) (string, error) {
 	sanitized, err := strconv.Unquote("\"" + s + "\"")
diff --git a/pkg/collector/helper_test.go b/pkg/collector/helper_test.go
index fa0b3ef4..1434b78f 100644
--- a/pkg/collector/helper_test.go
+++ b/pkg/collector/helper_test.go
@@ -3,161 +3,44 @@
 package collector
 
 import (
-	"fmt"
-	"os"
 	"path/filepath"
 	"testing"
 
-	"github.com/go-kit/log"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 
-var (
-	expectedNvidiaSmiOutput = `index, name, uuid, bus_id
-0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e, 00000000:07:00.0
-1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3, 00000000:0B:00.0
-2, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3, 00000000:48:00.0
-3, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3, 00000000:4C:00.0`
-	expectedAmdSmiOutput = `device,Serial Number,PCI Bus,Card series,Card model,Card vendor,Card SKU
-card0,20170000800c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317
-card1,20170003580c,0000:C8:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317
-card2,20180003050c,0000:8A:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317
-card3,20170005280c,0000:8D:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. 
[AMD/ATI],D16317` -) - -func getExpectedNvidiaDevs() map[int]Device { - nvidiaDevs := make(map[int]Device, 4) - nvidiaDevs[0] = Device{ - index: "0", - name: "Tesla V100-SXM2-32GB", - uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", - busID: BusID{domain: 0x0, bus: 0x7, slot: 0x0, function: 0x0}, - isMig: false, - } - nvidiaDevs[1] = Device{ - index: "1", - name: "Tesla V100-SXM2-32GB", - uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", - busID: BusID{domain: 0x0, bus: 0xb, slot: 0x0, function: 0x0}, - isMig: false, - } - nvidiaDevs[2] = Device{ - index: "2", - name: "Tesla V100-SXM2-32GB", - uuid: "GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3", - busID: BusID{domain: 0x0, bus: 0x48, slot: 0x0, function: 0x0}, - isMig: false, - } - nvidiaDevs[3] = Device{ - index: "3", - name: "Tesla V100-SXM2-32GB", - uuid: "GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3", - busID: BusID{domain: 0x0, bus: 0x4c, slot: 0x0, function: 0x0}, - isMig: false, - } - - return nvidiaDevs -} - -func getExpectedAmdDevs() map[int]Device { - amdDevs := make(map[int]Device, 4) - amdDevs[0] = Device{ - index: "0", - name: "deon Instinct MI50 32GB", - uuid: "20170000800c", - busID: BusID{domain: 0x0, bus: 0xc5, slot: 0x0, function: 0x0}, - isMig: false, - } - amdDevs[1] = Device{ - index: "1", - name: "deon Instinct MI50 32GB", - uuid: "20170003580c", - busID: BusID{domain: 0x0, bus: 0xc8, slot: 0x0, function: 0x0}, - isMig: false, - } - amdDevs[2] = Device{ - index: "2", - name: "deon Instinct MI50 32GB", - uuid: "20180003050c", - busID: BusID{domain: 0x0, bus: 0x8a, slot: 0x0, function: 0x0}, - isMig: false, +func TestSantizeMetricName(t *testing.T) { + testCases := []struct { + input string + expected string + }{ + { + input: "metric-name", + expected: "metric_name", + }, + { + input: "metric-name$", + expected: "metric_name_", + }, + { + input: "ns/metric-name", + expected: "ns_metric_name", + }, + } + + for _, tc := range testCases { + got := SanitizeMetricName(tc.input) + assert.Equal(t, tc.expected, got) } - amdDevs[3] = Device{ - index: "3", - name: "deon Instinct MI50 32GB", - uuid: "20170005280c", - busID: BusID{domain: 0x0, bus: 0x8d, slot: 0x0, function: 0x0}, - isMig: false, - } - - return amdDevs } -func TestParseNvidiaSmiOutput(t *testing.T) { - tempDir := t.TempDir() - nvidiaSMIPath := filepath.Join(tempDir, "nvidia-smi") - content := fmt.Sprintf(`#!/bin/bash -echo """%s""" -`, expectedNvidiaSmiOutput) - os.WriteFile(nvidiaSMIPath, []byte(content), 0o700) // #nosec - gpuDevices, err := GetNvidiaGPUDevices(nvidiaSMIPath, log.NewNopLogger()) +func TestInode(t *testing.T) { + absPath, err := filepath.Abs("testdata") require.NoError(t, err) - assert.Equal(t, gpuDevices, getExpectedNvidiaDevs()) -} -func TestParseAmdSmiOutput(t *testing.T) { - tempDir := t.TempDir() - amdSMIPath := filepath.Join(tempDir, "amd-smi") - content := fmt.Sprintf(`#!/bin/bash -echo """%s""" -`, expectedAmdSmiOutput) - os.WriteFile(amdSMIPath, []byte(content), 0o700) // #nosec - gpuDevices, err := GetAMDGPUDevices(amdSMIPath, log.NewNopLogger()) + inodeValue, err := inode(absPath) require.NoError(t, err) - assert.Equal(t, gpuDevices, getExpectedAmdDevs()) -} - -func TestParseBusIDPass(t *testing.T) { - id := "00000000:AD:00.0" - busID, err := parseBusID(id) - require.NoError(t, err) - - expectedID := BusID{domain: 0x0, bus: 0xad, slot: 0x0, function: 0x0} - - assert.Equal(t, expectedID, busID) -} - -func TestParseBusIDFail(t *testing.T) { - // Missing component - id := "00000000:AD:00" - _, err := parseBusID(id) - require.Error(t, err) - - // 
Malformed ID
-	id = "00000000:AD:00:4"
-	_, err = parseBusID(id)
-	require.Error(t, err)
-
-	// Not Hex
-	id = "ggggggg:AD:00:0"
-	_, err = parseBusID(id)
-	require.Error(t, err)
-}
-
-func TestCompareBusIDs(t *testing.T) {
-	// Sample Device
-	d := Device{busID: BusID{domain: 0x0, bus: 0xad, slot: 0x0, function: 0x0}}
-
-	// Test ID - pass
-	id := "00000000:AD:00.0"
-	assert.True(t, d.CompareBusID(id))
-
-	// Test ID - fail
-	id = "00000000:AD:0A.0"
-	assert.False(t, d.CompareBusID(id))
-
-	// Test ID - error fail
-	id = "00000000:AD:00"
-	assert.False(t, d.CompareBusID(id))
+	assert.Positive(t, inodeValue)
 }
diff --git a/pkg/collector/libvirt.go b/pkg/collector/libvirt.go
index 19a0d476..68365791 100644
--- a/pkg/collector/libvirt.go
+++ b/pkg/collector/libvirt.go
@@ -14,6 +14,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
@@ -95,7 +96,7 @@ type Address struct {
 type libvirtReadXMLSecurityCtxData struct {
 	xmlPath       string
 	instanceID    string
-	devices       map[int]Device
+	devices       []Device
 	instanceProps instanceProps
 }
@@ -112,18 +113,21 @@ type libvirtMetrics struct {
 }
 
 type libvirtCollector struct {
-	logger             log.Logger
-	cgroupManager      *cgroupManager
-	cgroupCollector    *cgroupCollector
-	perfCollector      *perfCollector
-	ebpfCollector      *ebpfCollector
-	rdmaCollector      *rdmaCollector
-	hostname           string
-	gpuDevs            map[int]Device
-	instanceGpuFlag    *prometheus.Desc
-	collectError       *prometheus.Desc
-	instancePropsCache map[string]instanceProps
-	securityContexts   map[string]*security.SecurityContext
+	logger                      log.Logger
+	cgroupManager               *cgroupManager
+	cgroupCollector             *cgroupCollector
+	perfCollector               *perfCollector
+	ebpfCollector               *ebpfCollector
+	rdmaCollector               *rdmaCollector
+	hostname                    string
+	gpuDevs                     []Device
+	vGPUActivated               bool
+	instanceGpuFlag             *prometheus.Desc
+	collectError                *prometheus.Desc
+	instancePropsCache          map[string]instanceProps
+	instancePropsCacheTTL       time.Duration
+	instancePropslastUpdateTime time.Time
+	securityContexts            map[string]*security.SecurityContext
 }
 
 func init() {
@@ -196,7 +200,7 @@ func NewLibvirtCollector(logger log.Logger) (Collector, error) {
 	// Attempt to get GPU devices
 	var gpuTypes []string
 
-	var gpuDevs map[int]Device
+	var gpuDevs []Device
 
 	if *gpuType != "" {
 		gpuTypes = []string{*gpuType}
@@ -213,6 +217,17 @@ func NewLibvirtCollector(logger log.Logger) (Collector, error) {
 		}
 	}
 
+	// Check if vGPU is activated on at least one GPU
+	vGPUActivated := false
+
+	for _, gpu := range gpuDevs {
+		if gpu.vgpuEnabled {
+			vGPUActivated = true
+
+			break
+		}
+	}
+
 	// Setup necessary capabilities. These are the caps we need to read
 	// XML files in /etc/libvirt/qemu folder that contains GPU devs used by guests.
 	caps := setupCollectorCaps(logger, libvirtCollectorSubsystem, []string{"cap_dac_read_search"})
@@ -226,18 +241,21 @@ func NewLibvirtCollector(logger log.Logger) (Collector, error) {
 	}
 
 	return &libvirtCollector{
-		cgroupManager:      cgroupManager,
-		cgroupCollector:    cgCollector,
-		perfCollector:      perfCollector,
-		ebpfCollector:      ebpfCollector,
-		rdmaCollector:      rdmaCollector,
-		hostname:           hostname,
-		gpuDevs:            gpuDevs,
-		instancePropsCache: make(map[string]instanceProps),
-		securityContexts:   map[string]*security.SecurityContext{libvirtReadXMLCtx: securityCtx},
+		cgroupManager:               cgroupManager,
+		cgroupCollector:             cgCollector,
+		perfCollector:               perfCollector,
+		ebpfCollector:               ebpfCollector,
+		rdmaCollector:               rdmaCollector,
+		hostname:                    hostname,
+		gpuDevs:                     gpuDevs,
+		vGPUActivated:               vGPUActivated,
+		instancePropsCache:          make(map[string]instanceProps),
+		instancePropsCacheTTL:       3 * time.Hour,
+		instancePropslastUpdateTime: time.Now(),
+		securityContexts:            map[string]*security.SecurityContext{libvirtReadXMLCtx: securityCtx},
 		instanceGpuFlag: prometheus.NewDesc(
 			prometheus.BuildFQName(Namespace, genericSubsystem, "unit_gpu_index_flag"),
-			"Indicates running instance on GPU, 1=instance running",
+			"A value > 0 indicates that the instance is running on the current GPU",
 			[]string{
 				"manager",
 				"hostname",
@@ -369,16 +387,57 @@ func (c *libvirtCollector) updateGPUOrdinals(ch chan<- prometheus.Metric, instan
 	for _, p := range instanceProps {
 		// GPU instance mapping
 		for _, gpuOrdinal := range p.gpuOrdinals {
-			var gpuuuid string
+			var gpuuuid, miggid string
+
+			flagValue := float64(1)
 
 			// Check the int index of devices where gpuOrdinal == dev.index
 			for _, dev := range c.gpuDevs {
-				if gpuOrdinal == dev.index {
+				// If the device has MIG enabled loop over them as well
+				for _, mig := range dev.migInstances {
+					if gpuOrdinal == mig.globalIndex {
+						gpuuuid = dev.uuid
+						miggid = strconv.FormatUint(mig.gpuInstID, 10)
+
+						// For MIG, we export the SM fraction as the flag value.
+						// For vGPU-enabled GPUs this fraction must be further
+						// divided by the number of active vGPU instances
+						if dev.vgpuEnabled && len(mig.mdevUUIDs) > 1 {
+							flagValue = mig.smFraction / float64(len(mig.mdevUUIDs))
+						} else {
+							flagValue = mig.smFraction
+						}
+
+						goto update_chan
+					}
+				}
+
+				if gpuOrdinal == dev.globalIndex {
 					gpuuuid = dev.uuid
 
-					break
+					if dev.vgpuEnabled && len(dev.mdevUUIDs) > 1 {
+						flagValue = 1.0 / float64(len(dev.mdevUUIDs))
+					}
+
+					goto update_chan
 				}
 			}
-			ch <- prometheus.MustNewConstMetric(c.instanceGpuFlag, prometheus.GaugeValue, float64(1), c.cgroupManager.manager, c.hostname, p.uuid, gpuOrdinal, fmt.Sprintf("%s-gpu-%s", c.hostname, gpuOrdinal), gpuuuid)
+
+		update_chan:
+			// We set the gpuuuid label in the format <gpu_uuid>/<gpu_instance_id>.
+			// On the DCGM side, we need to use relabel magic to merge the UUID
+			// and GPU_I_ID labels and set them exactly as <gpu_uuid>/<gpu_instance_id>
+			// as well
+			ch <- prometheus.MustNewConstMetric(
+				c.instanceGpuFlag,
+				prometheus.GaugeValue,
+				flagValue,
+				c.cgroupManager.manager,
+				c.hostname,
+				p.uuid,
+				gpuOrdinal,
+				fmt.Sprintf("%s/gpu-%s", c.hostname, gpuOrdinal),
+				fmt.Sprintf("%s/%s", gpuuuid, miggid),
+			)
 		}
 	}
 }
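The `<gpu_uuid>/<gpu_instance_id>` merge referenced in the comment above could be done on the DCGM exporter scrape job with Prometheus relabeling along these lines. This is a sketch: `UUID` and `GPU_I_ID` are the label names dcgm-exporter emits; everything else is illustrative:

```yaml
metric_relabel_configs:
  # Join DCGM's UUID and GPU_I_ID labels into a single
  # <gpu_uuid>/<gpu_instance_id> label matching gpuuuid above.
  - source_labels: [UUID, GPU_I_ID]
    separator: /
    target_label: gpuuuid
```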
@@ -394,6 +453,15 @@ func (c *libvirtCollector) discoverCgroups() (libvirtMetrics, error) {
 
 	instanceIDUUIDMap := make(map[string]string)
 
+	// OpenStack instances can be resized by changing their flavour, which
+	// means GPUs can be added to instances that previously had none. So we
+	// need to invalidate instancePropsCache once in a while to ensure we
+	// capture any changes in instance flavours
+	if time.Since(c.instancePropslastUpdateTime) > c.instancePropsCacheTTL {
+		c.instancePropsCache = make(map[string]instanceProps)
+		c.instancePropslastUpdateTime = time.Now()
+	}
+
 	// Walk through all cgroups and get cgroup paths
 	// https://goplay.tools/snippet/coVDkIozuhg
 	if err := filepath.WalkDir(c.cgroupManager.mountPoint, func(p string, info fs.DirEntry, err error) error {
@@ -475,6 +543,14 @@ func (c *libvirtCollector) discoverCgroups() (libvirtMetrics, error) {
 
 // instanceProperties returns instance properties parsed from XML file.
 func (c *libvirtCollector) instanceProperties(instanceID string) instanceProps {
+	// If vGPU is activated on at least one GPU, update mdevs
+	if c.vGPUActivated {
+		if updatedGPUDevs, err := updateGPUMdevs(c.gpuDevs); err == nil {
+			c.gpuDevs = updatedGPUDevs
+			level.Debug(c.logger).Log("msg", "GPU mdevs updated")
+		}
+	}
+
 	// Read XML file in a security context that raises necessary capabilities
 	dataPtr := &libvirtReadXMLSecurityCtxData{
 		xmlPath:    *libvirtXMLDir,
@@ -539,25 +615,49 @@ func readLibvirtXMLFile(data interface{}) error {
 		if hostDev.Type == "pci" {
 			gpuBusID := fmt.Sprintf(
 				"%s:%s:%s.%s",
-				strings.TrimPrefix(hostDev.Address.Domain, "0x"),
-				strings.TrimPrefix(hostDev.Address.Bus, "0x"),
-				strings.TrimPrefix(hostDev.Address.Slot, "0x"),
-				strings.TrimPrefix(hostDev.Address.Function, "0x"),
+				strings.TrimPrefix(hostDev.Source.Address.Domain, "0x"),
+				strings.TrimPrefix(hostDev.Source.Address.Bus, "0x"),
+				strings.TrimPrefix(hostDev.Source.Address.Slot, "0x"),
+				strings.TrimPrefix(hostDev.Source.Address.Function, "0x"),
 			)
 
 			// Check if the current Bus ID matches with any existing GPUs
-			for idx, dev := range d.devices {
+			for _, dev := range d.devices {
 				if dev.CompareBusID(gpuBusID) {
-					gpuOrdinals = append(gpuOrdinals, strconv.FormatInt(int64(idx), 10))
+					gpuOrdinals = append(gpuOrdinals, dev.globalIndex)
 
 					break
 				}
 			}
+		} else if hostDev.Type == "mdev" {
+			mdevUUID := hostDev.Source.Address.UUID
+
+			// Check which GPU has this mdev UUID
+			for _, dev := range d.devices {
+				if dev.migEnabled {
+					for _, mig := range dev.migInstances {
+						if slices.Contains(mig.mdevUUIDs, mdevUUID) {
+							gpuOrdinals = append(gpuOrdinals, mig.globalIndex)
+
+							break
+						}
+					}
+				} else {
+					if slices.Contains(dev.mdevUUIDs, mdevUUID) {
+						gpuOrdinals = append(gpuOrdinals, dev.globalIndex)
+
+						break
+					}
+				}
+			}
 		}
 	}
 
 	// Read instance properties into dataPointer
-	d.instanceProps = instanceProps{uuid: domain.UUID, gpuOrdinals: gpuOrdinals}
+	d.instanceProps = instanceProps{
+		uuid:        domain.UUID,
+		gpuOrdinals: gpuOrdinals,
+	}
 
 	return nil
 }
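For reference, the two hostdev shapes that readLibvirtXMLFile distinguishes look roughly like this in a guest's libvirt domain XML (illustrative fragment; the mdev UUID is taken from the test fixtures):

```xml
<!-- Passed-through physical GPU: matched against device bus IDs -->
<hostdev mode='subsystem' type='pci' managed='yes'>
  <source>
    <address domain='0x0000' bus='0x10' slot='0x00' function='0x0'/>
  </source>
</hostdev>
<!-- vGPU mediated device: matched against discovered mdev UUIDs -->
<hostdev mode='subsystem' type='mdev' model='vfio-pci'>
  <source>
    <address uuid='c73f1fa6-489e-4834-9476-d70dabd98c40'/>
  </source>
</hostdev>
```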
"--collector.libvirt.xml-dir", "testdata/qemu", "--collector.cgroups.force-version", "v2", + "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", }, ) require.NoError(t, err) @@ -76,14 +79,25 @@ func TestLibvirtInstanceProps(t *testing.T) { }, } + noOpLogger := log.NewNopLogger() + + gpuDevs, err := GetGPUDevices("nvidia", noOpLogger) + require.NoError(t, err) + c := libvirtCollector{ - gpuDevs: mockGPUDevices(), - logger: log.NewNopLogger(), - cgroupManager: cgManager, - instancePropsCache: make(map[string]instanceProps), - securityContexts: make(map[string]*security.SecurityContext), + gpuDevs: gpuDevs, + logger: noOpLogger, + cgroupManager: cgManager, + vGPUActivated: true, + instancePropsCache: make(map[string]instanceProps), + instancePropsCacheTTL: 500 * time.Millisecond, + instancePropslastUpdateTime: time.Now(), + securityContexts: make(map[string]*security.SecurityContext), } + // Last update time + lastUpdateTime := c.instancePropslastUpdateTime + // Add dummy security context c.securityContexts[libvirtReadXMLCtx], err = security.NewSecurityContext( libvirtReadXMLCtx, @@ -93,23 +107,26 @@ func TestLibvirtInstanceProps(t *testing.T) { ) require.NoError(t, err) - expectedProps := instanceProps{ - gpuOrdinals: []string{"0", "1"}, - uuid: "57f2d45e-8ddf-4338-91df-62d0044ff1b5", + expectedProps := []instanceProps{ + {uuid: "57f2d45e-8ddf-4338-91df-62d0044ff1b5", gpuOrdinals: []string{"1", "8"}}, + {uuid: "b674a0a2-c300-4dc6-8c9c-65df16da6d69", gpuOrdinals: []string{"0", "3"}}, + {uuid: "2896bdd5-dbc2-4339-9d8e-ddd838bf35d3", gpuOrdinals: []string{"11", "9"}}, + {uuid: "4de89c5b-50d7-4d30-a630-14e135380fe8", gpuOrdinals: []string(nil)}, } metrics, err := c.discoverCgroups() require.NoError(t, err) - var gotProps instanceProps + assert.EqualValues(t, expectedProps, metrics.instanceProps) - for _, props := range metrics.instanceProps { - if props.uuid == expectedProps.uuid { - gotProps = props - } - } + // Sleep for 0.5 seconds to ensure we invalidate cache + time.Sleep(500 * time.Millisecond) - assert.Equal(t, expectedProps, gotProps) + _, err = c.discoverCgroups() + require.NoError(t, err) + + // Now check if lastUpdateTime is less than 0.5 se + assert.Greater(t, c.instancePropslastUpdateTime.Sub(lastUpdateTime), 500*time.Millisecond) } func TestInstancePropsCaching(t *testing.T) { @@ -127,6 +144,7 @@ func TestInstancePropsCaching(t *testing.T) { []string{ "--path.cgroupfs", cgroupsPath, "--collector.libvirt.xml-dir", xmlFilePath, + "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", }, ) require.NoError(t, err) @@ -142,13 +160,20 @@ func TestInstancePropsCaching(t *testing.T) { }, } - mockGPUDevs := mockGPUDevices() + noOpLogger := log.NewNopLogger() + + gpuDevs, err := GetGPUDevices("nvidia", noOpLogger) + require.NoError(t, err) + c := libvirtCollector{ - cgroupManager: cgManager, - logger: log.NewNopLogger(), - gpuDevs: mockGPUDevs, - instancePropsCache: make(map[string]instanceProps), - securityContexts: make(map[string]*security.SecurityContext), + cgroupManager: cgManager, + logger: noOpLogger, + gpuDevs: gpuDevs, + vGPUActivated: true, + instancePropsCache: make(map[string]instanceProps), + instancePropsCacheTTL: 500 * time.Millisecond, + instancePropslastUpdateTime: time.Now(), + securityContexts: make(map[string]*security.SecurityContext), } // Add dummy security context @@ -168,27 +193,35 @@ func TestInstancePropsCaching(t *testing.T) { require.NoError(t, err) } - // Binds GPUs to first n jobs - for igpu := range mockGPUDevs { + // Binds GPUs to first n instances 
+	var iInstance int
+
+	var fullGPUInstances []string
+
+	for _, dev := range gpuDevs {
 		xmlContentPH := `<domain type="kvm">
	<name>instance-%[1]d</name>
	<uuid>%[1]d</uuid>
	<devices>
		<hostdev mode="subsystem" type="pci">
-			<address domain="0x0000" bus="0x%[2]s" slot="0x0" function="0x0"/>
+			<source>
+				<address domain="0x0000" bus="0x%[2]s" slot="0x0" function="0x0"/>
+			</source>
		</hostdev>
	</devices>
</domain>
` - xmlContent := fmt.Sprintf(xmlContentPH, igpu, strconv.FormatUint(mockGPUDevs[igpu].busID.bus, 16)) - err = os.WriteFile( - fmt.Sprintf("%s/instance-0000000%d.xml", xmlFilePath, igpu), - []byte(xmlContent), - 0o600, - ) - require.NoError(t, err) + if !dev.vgpuEnabled && !dev.migEnabled { + xmlContent := fmt.Sprintf(xmlContentPH, iInstance, strconv.FormatUint(dev.busID.bus, 16)) + err = os.WriteFile( + fmt.Sprintf("%s/instance-0000000%d.xml", xmlFilePath, iInstance), + []byte(xmlContent), + 0o600, + ) + require.NoError(t, err) + + fullGPUInstances = append(fullGPUInstances, dev.globalIndex) + iInstance++ + } } // Now call get metrics which should populate instancePropsCache @@ -198,9 +231,9 @@ func TestInstancePropsCaching(t *testing.T) { // Check if instancePropsCache has 20 instances and GPU ordinals are correct assert.Len(t, c.instancePropsCache, 20) - for igpu := range mockGPUDevs { - gpuIDString := strconv.FormatInt(int64(igpu), 10) - assert.Equal(t, []string{gpuIDString}, c.instancePropsCache["instance-0000000"+gpuIDString].gpuOrdinals) + for i, gpuIDString := range fullGPUInstances { + instanceIDString := strconv.FormatInt(int64(i), 10) + assert.Equal(t, []string{gpuIDString}, c.instancePropsCache["instance-0000000"+instanceIDString].gpuOrdinals) } // Remove first 10 instances and add new 10 more instances diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index e3638748..824ec525 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -11,6 +11,7 @@ import ( "os" "path/filepath" "slices" + "strconv" "strings" "sync" @@ -45,11 +46,16 @@ var ( "Enables collection of PSI metrics (default: disabled)", ).Default("false").Bool() - // Generic. + // GPU opts. + slurmGPUOrdering = CEEMSExporterApp.Flag( + "collector.slurm.gpu-order-map", + `GPU order mapping between SLURM and NVIDIA SMI/ROCm SMI tools. +It should be of format : [.] delimited by ",".`, + ).Default("").PlaceHolder("0:1,1:0.3,2:0.4,3:0.5,4:0.6").String() slurmGPUStatPath = CEEMSExporterApp.Flag( "collector.slurm.gpu-job-map-path", - "Path to file that maps GPU ordinals to job IDs.", - ).Default("/run/gpujobmap").String() + "Path to directory that maps GPU ordinals to job IDs.", + ).Default("").String() ) // Security context names. 
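The new --collector.slurm.gpu-order-map flag above feeds reindexGPUs, which is called from NewSlurmCollector below but whose body is not part of this patch. A minimal sketch of a parser for the documented format, assuming the left side of each entry is the SLURM ordinal and the right side the SMI index with an optional MIG GPU instance ID after the dot, and reusing the Device/migInstances fields shown elsewhere in this diff (strings and strconv imported):

// Sketch only; the real reindexGPUs lives elsewhere in pkg/collector and
// may differ.
func reindexGPUsSketch(orderMap string, devs []Device) []Device {
	for _, entry := range strings.Split(orderMap, ",") {
		parts := strings.Split(entry, ":")
		if len(parts) != 2 {
			continue // skip malformed entries
		}

		slurmIndex := strings.TrimSpace(parts[0])
		smiParts := strings.Split(strings.TrimSpace(parts[1]), ".")

		for i := range devs {
			if devs[i].globalIndex != smiParts[0] {
				continue
			}

			if len(smiParts) == 2 {
				// e.g. "1:0.3" re-maps MIG GPU instance ID 3 on SMI
				// device 0 to SLURM ordinal 1.
				for j := range devs[i].migInstances {
					if strconv.FormatUint(devs[i].migInstances[j].gpuInstID, 10) == smiParts[1] {
						devs[i].migInstances[j].globalIndex = slurmIndex
					}
				}
			} else {
				devs[i].globalIndex = slurmIndex
			}
		}
	}

	return devs
}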
@@ -89,7 +95,7 @@ type slurmCollector struct { ebpfCollector *ebpfCollector rdmaCollector *rdmaCollector hostname string - gpuDevs map[int]Device + gpuDevs []Device procFS procfs.FS jobGpuFlag *prometheus.Desc collectError *prometheus.Desc @@ -178,7 +184,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { // Attempt to get GPU devices var gpuTypes []string - var gpuDevs map[int]Device + var gpuDevs []Device if *gpuType != "" { gpuTypes = []string{*gpuType} @@ -195,6 +201,13 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { } } + // Correct GPU ordering based on CLI flag when provided + if *slurmGPUOrdering != "" { + gpuDevs = reindexGPUs(*slurmGPUOrdering, gpuDevs) + + level.Debug(logger).Log("msg", "GPU reindexed based") + } + // Instantiate a new Proc FS procFS, err := procfs.NewFS(*procfsPath) if err != nil { @@ -228,7 +241,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { securityContexts: map[string]*security.SecurityContext{slurmReadProcCtx: securityCtx}, jobGpuFlag: prometheus.NewDesc( prometheus.BuildFQName(Namespace, genericSubsystem, "unit_gpu_index_flag"), - "Indicates running job on GPU, 1=job running", + "A value > 0 indicates the job using current GPU", []string{ "manager", "hostname", @@ -360,16 +373,47 @@ func (c *slurmCollector) updateGPUOrdinals(ch chan<- prometheus.Metric, jobProps for _, p := range jobProps { // GPU job mapping for _, gpuOrdinal := range p.gpuOrdinals { - var gpuuuid string + var gpuuuid, miggid string + + flagValue := float64(1) // Check the int index of devices where gpuOrdinal == dev.index for _, dev := range c.gpuDevs { - if gpuOrdinal == dev.index { + // If the device has MIG enabled loop over them as well + for _, mig := range dev.migInstances { + if gpuOrdinal == mig.globalIndex { + gpuuuid = dev.uuid + miggid = strconv.FormatUint(mig.gpuInstID, 10) + + // For MIG, we export SM fraction as flag value + flagValue = mig.smFraction + + goto update_chan + } + } + + if gpuOrdinal == dev.globalIndex { gpuuuid = dev.uuid - break + goto update_chan } } - ch <- prometheus.MustNewConstMetric(c.jobGpuFlag, prometheus.GaugeValue, float64(1), c.cgroupManager.manager, c.hostname, p.uuid, gpuOrdinal, fmt.Sprintf("%s-gpu-%s", c.hostname, gpuOrdinal), gpuuuid) + + update_chan: + // We set label of gpuuuid of format / + // On the DCGM side, we need to use relabel magic to merge UUID + // and GPU_I_ID labels and set them exactly as / + // as well + ch <- prometheus.MustNewConstMetric( + c.jobGpuFlag, + prometheus.GaugeValue, + flagValue, + c.cgroupManager.manager, + c.hostname, + p.uuid, + gpuOrdinal, + fmt.Sprintf("%s/gpu-%s", c.hostname, gpuOrdinal), + fmt.Sprintf("%s/%s", gpuuuid, miggid), + ) } } } @@ -456,10 +500,43 @@ func (c *slurmCollector) discoverCgroups() (slurmMetrics, error) { return slurmMetrics{cgMetrics: cgMetrics, jobProps: jProps}, nil } +// readGPUMapFile reads file created by prolog script to retrieve job ID of a given GPU. +func (c *slurmCollector) readGPUMapFile(index string) string { + gpuJobMapInfo := fmt.Sprintf("%s/%s", *slurmGPUStatPath, index) + + // NOTE: Look for file name with UUID as it will be more appropriate with + // MIG instances. + // If /run/gpustat/0 file is not found, check for the file with UUID as name? 
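+	// Returns the job ID written by the SLURM prolog for the given GPU index,
+	// or an empty string when the map file is missing or unreadable.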
+ var uuid string + + if _, err := os.Stat(gpuJobMapInfo); err == nil { + content, err := os.ReadFile(gpuJobMapInfo) + if err != nil { + level.Error(c.logger).Log( + "msg", "Failed to get job ID for GPU", + "index", index, "err", err, + ) + + return "" + } + + if _, err := fmt.Sscanf(string(content), "%s", &uuid); err != nil { + level.Error(c.logger).Log( + "msg", "Failed to scan job ID for GPU", + "index", index, "err", err, + ) + + return "" + } + + return uuid + } + + return "" +} + // gpuOrdinalsFromProlog returns GPU ordinals of jobs from prolog generated run time files by SLURM. func (c *slurmCollector) gpuOrdinalsFromProlog(uuid string) []string { - var gpuJobID string - var gpuOrdinals []string // If there are no GPUs this loop will be skipped anyways @@ -473,35 +550,16 @@ func (c *slurmCollector) gpuOrdinalsFromProlog(uuid string) []string { // it but just to be safe. This will have a small overhead as we need to check the // correct integer index for each device index. We can live with it as there are // typically 2/4/8 GPUs per node. - for i := range c.gpuDevs { - dev := c.gpuDevs[i] - gpuJobMapInfo := fmt.Sprintf("%s/%s", *slurmGPUStatPath, dev.index) - - // NOTE: Look for file name with UUID as it will be more appropriate with - // MIG instances. - // If /run/gpustat/0 file is not found, check for the file with UUID as name? - if _, err := os.Stat(gpuJobMapInfo); err == nil { - content, err := os.ReadFile(gpuJobMapInfo) - if err != nil { - level.Error(c.logger).Log( - "msg", "Failed to get job ID for GPU", - "index", dev.index, "uuid", dev.uuid, "err", err, - ) - - continue - } - - if _, err := fmt.Sscanf(string(content), "%s", &gpuJobID); err != nil { - level.Error(c.logger).Log( - "msg", "Failed to scan job ID for GPU", - "index", dev.index, "uuid", dev.uuid, "err", err, - ) - - continue + for _, dev := range c.gpuDevs { + if dev.migEnabled { + for _, mig := range dev.migInstances { + if c.readGPUMapFile(mig.globalIndex) == uuid { + gpuOrdinals = append(gpuOrdinals, mig.globalIndex) + } } - - if gpuJobID == uuid { - gpuOrdinals = append(gpuOrdinals, dev.index) + } else { + if c.readGPUMapFile(dev.globalIndex) == uuid { + gpuOrdinals = append(gpuOrdinals, dev.globalIndex) } } } diff --git a/pkg/collector/slurm_test.go b/pkg/collector/slurm_test.go index 128815c6..d3e76a77 100644 --- a/pkg/collector/slurm_test.go +++ b/pkg/collector/slurm_test.go @@ -20,23 +20,22 @@ import ( "github.com/stretchr/testify/require" ) -func mockGPUDevices() map[int]Device { - devs := make(map[int]Device, 4) +func mockGPUDevices() []Device { + devs := make([]Device, 5) busIDs := []BusID{ - {domain: 0, bus: 7, slot: 0, function: 0}, - {domain: 0, bus: 11, slot: 0, function: 0}, - {domain: 0, bus: 72, slot: 0, function: 0}, - {domain: 0, bus: 76, slot: 0, function: 0}, - {domain: 0, bus: 77, slot: 0, function: 0}, + {domain: 0, bus: 7, device: 0, function: 0}, + {domain: 0, bus: 11, device: 0, function: 0}, + {domain: 0, bus: 72, device: 0, function: 0}, + {domain: 0, bus: 76, device: 0, function: 0}, + {domain: 0, bus: 77, device: 0, function: 0}, } for i := 0; i <= 4; i++ { - idxString := strconv.Itoa(i) devs[i] = Device{ - index: idxString, - uuid: fmt.Sprintf("GPU-%d", i), - busID: busIDs[i], + globalIndex: strconv.Itoa(i), + uuid: fmt.Sprintf("GPU-%d", i), + busID: busIDs[i], } } diff --git a/pkg/collector/testdata/nvidia-smi b/pkg/collector/testdata/nvidia-smi index bf392714..92c12a95 100755 --- a/pkg/collector/testdata/nvidia-smi +++ b/pkg/collector/testdata/nvidia-smi @@ -1,7 +1,773 
@@ #!/bin/bash -printf """index, name, uuid, bus_id -0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e, 00000000:07:00.0 -1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3, 00000000:0B:00.0 -2, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3, 00000000:48:00.0 -3, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3, 00000000:4C:00.0""" +sub_help(){ + echo "nvidia-smi help" +} + +sub_--query(){ + printf """ + + + Fri Oct 11 18:24:09 2024 + 535.129.03 + 12.2 + 8 + + NVIDIA A100-PCIE-40GB + NVIDIA + Ampere + Enabled + Disabled + Enabled + None + + N/A + N/A + + + None + + Disabled + 4000 + + N/A + N/A + + 1323920023230 + GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e + 0 + 92.00.25.00.08 + No + + VGPU + N/A + + + + NVIDIA A100-PCIE-40GB + NVIDIA + Ampere + Enabled + Disabled + Enabled + None + + N/A + N/A + + + None + + Disabled + 4000 + + N/A + N/A + + 1323920023230 + GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 + 1 + 92.00.25.00.08 + No + + None + N/A + + + + NVIDIA A100-PCIE-40GB + NVIDIA + Ampere + Enabled + Disabled + Enabled + None + + Enabled + Enabled + + + + 0 + 1 + 0 + + + 42 + 3 + 0 + 2 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 37 MiB + 19930 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 1 + 5 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 9856 MiB + 0 MiB + 12 MiB + 9843 MiB + + + 16383 MiB + 0 MiB + 16383 MiB + + + + 2 + 13 + 0 + + + 14 + 1 + 0 + 0 + 0 + 0 + + + + + 0 + + + + 4864 MiB + 0 MiB + 12 MiB + 4851 MiB + + + 8191 MiB + 0 MiB + 8191 MiB + + + + Disabled + 4000 + + N/A + N/A + + 1323920022972 + GPU-956348bc-d43d-23ed-53d4-857749fa2b67 + 2 + 92.00.25.00.08 + No + + VGPU + N/A + + + + + NVIDIA A100-PCIE-40GB + NVIDIA + Ampere + Enabled + Disabled + Enabled + None + + Enabled + Enabled + + + + 0 + 1 + 0 + + + 56 + 4 + 0 + 2 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 49 MiB + 19918 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 1 + 5 + 0 + + + 28 + 2 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 9856 MiB + 0 MiB + 25 MiB + 9831 MiB + + + 16383 MiB + 0 MiB + 16383 MiB + + + + 2 + 6 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 9856 MiB + 0 MiB + 12 MiB + 9843 MiB + + + 16383 MiB + 0 MiB + 16383 MiB + + + + Disabled + 4000 + + N/A + N/A + + 1323920023230 + GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7 + 3 + 92.00.25.00.08 + No + + VGPU + N/A + + + + NVIDIA A100-PCIE-40GB + NVIDIA + Ampere + Enabled + Disabled + Enabled + None + + N/A + N/A + + + None + + Disabled + 4000 + + N/A + N/A + + 1323920023230 + GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3 + 4 + 92.00.25.00.08 + No + + None + N/A + + + + NVIDIA A100-PCIE-40GB + NVIDIA + Ampere + Enabled + Disabled + Enabled + None + + N/A + N/A + + + None + + Disabled + 4000 + + N/A + N/A + + 1323920023230 + GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3 + 5 + 92.00.25.00.08 + No + + VGPU + N/A + + + + NVIDIA A100-PCIE-40GB + NVIDIA + Ampere + Enabled + Disabled + Enabled + None + + N/A + N/A + + + None + + Disabled + 4000 + + N/A + N/A + + 1323920023230 + GPU-1d4d0f3e-b51a-4040-96e3-bf380f7c5728 + 6 + 92.00.25.00.08 + No + + None + N/A + + + + NVIDIA A100-PCIE-40GB + NVIDIA + Ampere + Enabled + Disabled + Enabled + None + + N/A + N/A + + + None + + Disabled + 4000 + + N/A + N/A + + 1323920023230 + GPU-6cc98505-fdde-461e-a93c-6935fba45a27 + 7 + 92.00.25.00.08 + No + + None + N/A + + + + +""" +} + +sub_vgpu(){ + printf """GPU 00000000:10:00.0 + Active vGPUs : 2 + vGPU ID : 3251634213 + VM UUID : 3418ce09-7aad-4506-9305-618522870574 + VM Name : ubuntu-vm-0 + vGPU Name : GRID A100-20C + vGPU 
Type : 472 + vGPU UUID : 9a0a5219-8563-11ef-bc81-d4857749fa2b + MDEV UUID : c73f1fa6-489e-4834-9476-d70dabd98c40 + Guest Driver Version : 470.256.02 + License Status : Unlicensed (Restricted) + GPU Instance ID : N/A + Accounting Mode : Disabled + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : 15 FPS + FB Memory Usage + Total : 20480 MiB + Used : 640 MiB + Free : 19840 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + vGPU ID : 3251634217 + VM UUID : c77c546c-c5a2-4093-bc1b-8051983e7a61 + VM Name : ubuntu-vm-1 + vGPU Name : GRID A100-20C + vGPU Type : 472 + vGPU UUID : ca12c613-8563-11ef-85cd-bcd43d23ed53 + MDEV UUID : f9702ffa-fa28-414e-a52f-e7831fd5ce41 + Guest Driver Version : N/A + License Status : N/A + GPU Instance ID : N/A + Accounting Mode : N/A + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : N/A + FB Memory Usage + Total : 20480 MiB + Used : 0 MiB + Free : 20480 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + +GPU 00000000:15:00.0 + Active vGPUs : 0 + +GPU 00000000:21:00.0 + Active vGPUs : 3 + vGPU ID : 3251634213 + VM UUID : 66712440-b722-432b-a212-f9ce01acc646 + VM Name : ubuntu-vm-2 + vGPU Name : GRID A100-1-10C + vGPU Type : 472 + vGPU UUID : 8d8b33a5-88c0-45b8-a213-7e936f9cd63b + MDEV UUID : f0f4b97c-6580-48a6-ae1b-a807d6dfe08f + Guest Driver Version : 470.256.02 + License Status : Unlicensed (Restricted) + GPU Instance ID : 1 + Accounting Mode : Disabled + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : 15 FPS + FB Memory Usage + Total : 20480 MiB + Used : 640 MiB + Free : 19840 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + vGPU ID : 3251634217 + VM UUID : c77c546c-c5a2-4093-bc1b-8051983e7a61 + VM Name : ubuntu-vm-3 + vGPU Name : GRID A100-5-20C + vGPU Type : 452 + vGPU UUID : ca12c613-8563-11ef-85cd-bcd43d23ed53 + MDEV UUID : 3b356d38-854e-48be-b376-00c72c7d119c + Guest Driver Version : N/A + License Status : N/A + GPU Instance ID : 5 + Accounting Mode : N/A + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : N/A + FB Memory Usage + Total : 20480 MiB + Used : 0 MiB + Free : 20480 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + vGPU ID : 3251634217 + VM UUID : c77c546c-c5a2-4093-bc1b-8051983e7a61 + VM Name : ubuntu-vm-4 + vGPU Name : GRID A100-13-20C + vGPU Type : 462 + vGPU UUID : ca12c613-8563-11ef-85cd-bcd43d23ed53 + MDEV UUID : 5bb3bad7-ce3b-4aa5-84d7-b5b33cf9d45e + Guest Driver Version : N/A + License Status : N/A + GPU Instance ID : 5 + Accounting Mode : N/A + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : N/A + FB Memory Usage + Total : 20480 MiB + Used : 0 MiB + Free : 20480 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + 
Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + +GPU 00000000:81:00.0 + Active vGPUs : 3 + vGPU ID : 3251634213 + VM UUID : 66712440-b722-432b-a212-f9ce01acc646 + VM Name : ubuntu-vm-5 + vGPU Name : GRID A100-1-10C + vGPU Type : 472 + vGPU UUID : 8d8b33a5-88c0-45b8-a213-7e936f9cd63b + MDEV UUID : 4f84d324-5897-48f3-a4ef-94c9ddf23d78 + Guest Driver Version : 470.256.02 + License Status : Unlicensed (Restricted) + GPU Instance ID : 1 + Accounting Mode : Disabled + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : 15 FPS + FB Memory Usage + Total : 20480 MiB + Used : 640 MiB + Free : 19840 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + vGPU ID : 3251634213 + VM UUID : 66712440-b722-432b-a212-f9ce01acc646 + VM Name : ubuntu-vm-6 + vGPU Name : GRID A100-5-10C + vGPU Type : 472 + vGPU UUID : 8d8b33a5-88c0-45b8-a213-7e936f9cd63b + MDEV UUID : 3058eb95-0899-4c3d-90e9-e20b6c14789f + Guest Driver Version : 470.256.02 + License Status : Unlicensed (Restricted) + GPU Instance ID : 5 + Accounting Mode : Disabled + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : 15 FPS + FB Memory Usage + Total : 20480 MiB + Used : 640 MiB + Free : 19840 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + vGPU ID : 3251634213 + VM UUID : 66712440-b722-432b-a212-f9ce01acc646 + VM Name : ubuntu-vm-7 + vGPU Name : GRID A100-6-10C + vGPU Type : 472 + vGPU UUID : 8d8b33a5-88c0-45b8-a213-7e936f9cd63b + MDEV UUID : 9f0d5993-9778-40c7-a721-3fec93d6b3a9 + Guest Driver Version : 470.256.02 + License Status : Unlicensed (Restricted) + GPU Instance ID : 6 + Accounting Mode : Disabled + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : 15 FPS + FB Memory Usage + Total : 20480 MiB + Used : 640 MiB + Free : 19840 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + +GPU 00000000:83:00.0 + Active vGPUs : 0 + +GPU 00000000:85:00.0 + Active vGPUs : 1 + vGPU ID : 3251634213 + VM UUID : 3418ce09-7aad-4506-9305-618522870574 + VM Name : ubuntu-vm-8 + vGPU Name : GRID A100-20C + vGPU Type : 472 + vGPU UUID : 9a0a5219-8563-11ef-bc81-d4857749fa2b + MDEV UUID : 64c3c4ae-44e1-45b8-8d46-5f76a1fa9824 + Guest Driver Version : 470.256.02 + License Status : Unlicensed (Restricted) + GPU Instance ID : N/A + Accounting Mode : Disabled + ECC Mode : Enabled + Accounting Buffer Size : 4000 + Frame Rate Limit : 15 FPS + FB Memory Usage + Total : 20480 MiB + Used : 640 MiB + Free : 19840 MiB + Utilization + Gpu : 0 %% + Memory : 0 %% + Encoder : 0 %% + Decoder : 0 %% + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + +GPU 00000000:87:00.0 + Active vGPUs : 0 + +GPU 00000000:89:00.0 + Active vGPUs : 0 +""" +} + +subcommand=$1 +case $subcommand in + "" | "-h" | "--help") + sub_help + ;; + *) + shift + sub_${subcommand} $@ + if [ $? = 127 ]; then + echo "Error: '$subcommand' is not a known subcommand." 
>&2 + echo " Run '$ProgName --help' for a list of known subcommands." >&2 + exit 1 + fi + ;; +esac diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt index 4846290a..0fbceba2 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt @@ -1,110 +1,113 @@ # HELP ceems_compute_unit_blkio_read_total_bytes Total block IO read bytes # TYPE ceems_compute_unit_blkio_read_total_bytes gauge +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 3.25280768e+08 +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 3.25280768e+08 ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 3.25280768e+08 -ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 3.25280768e+08 ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 3.25280768e+08 -ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 3.25280768e+08 # HELP ceems_compute_unit_blkio_read_total_requests Total block IO read requests # TYPE ceems_compute_unit_blkio_read_total_requests gauge +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 10957 +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 10957 ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 10957 -ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 10957 ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 10957 -ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 10957 # HELP ceems_compute_unit_blkio_write_total_bytes Total block IO write bytes # TYPE ceems_compute_unit_blkio_write_total_bytes gauge +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 3.088384e+07 +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 3.088384e+07 ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 3.088384e+07 -ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 3.088384e+07 ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 3.088384e+07 -ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 3.088384e+07 # HELP ceems_compute_unit_blkio_write_total_requests Total block IO write requests # TYPE 
ceems_compute_unit_blkio_write_total_requests gauge +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 4803 +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 4803 ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4803 -ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4803 ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4803 -ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4803 # HELP ceems_compute_unit_cpu_psi_seconds Total CPU PSI in seconds # TYPE ceems_compute_unit_cpu_psi_seconds gauge +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 -ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds # TYPE ceems_compute_unit_cpu_system_seconds_total counter +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0.45 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0.45 ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0.45 -ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0.45 ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0.45 -ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0.45 # HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds # TYPE ceems_compute_unit_cpu_user_seconds_total counter +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0.39 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0.39 ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0.39 -ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0.39 ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0.39 -ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0.39 # HELP ceems_compute_unit_cpus Total number of job CPUs # TYPE ceems_compute_unit_cpus gauge 
+ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 -ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 -# HELP ceems_compute_unit_gpu_index_flag Indicates running instance on GPU, 1=instance running +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates running instance using current GPU # TYPE ceems_compute_unit_gpu_index_flag gauge -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3/",hindex="/gpu-9",hostname="",index="9",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3/",hindex="/gpu-1",hostname="",index="1",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3/",hindex="/gpu-8",hostname="",index="8",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-6cc98505-fdde-461e-a93c-6935fba45a27/",hindex="/gpu-11",hostname="",index="11",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/5",hindex="/gpu-3",hostname="",index="3",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0.1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e/",hindex="/gpu-0",hostname="",index="0",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0.5 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 2.1086208e+07 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 2.1086208e+07 ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 2.1086208e+07 -ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 2.1086208e+07 ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 2.1086208e+07 -ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 2.1086208e+07 # HELP ceems_compute_unit_memory_fail_count Memory fail count # TYPE 
ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 -ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_memory_psi_seconds Total memory PSI in seconds # TYPE ceems_compute_unit_memory_psi_seconds gauge +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 -ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes # TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 1.0407936e+07 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 1.0407936e+07 ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1.0407936e+07 -ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 1.0407936e+07 ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 1.0407936e+07 -ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1.0407936e+07 # HELP ceems_compute_unit_memory_total_bytes Memory total in bytes # TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 2.01362030592e+11 +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 2.01362030592e+11 ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 2.01362030592e+11 -ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 2.01362030592e+11 ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 2.01362030592e+11 -ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 2.01362030592e+11 # HELP ceems_compute_unit_memory_used_bytes Memory used in bytes # TYPE ceems_compute_unit_memory_used_bytes gauge 
+ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 4.0194048e+07 +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 4.0194048e+07 ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.0194048e+07 -ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.0194048e+07 ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.0194048e+07 -ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.0194048e+07 # HELP ceems_compute_unit_memsw_fail_count Swap fail count # TYPE ceems_compute_unit_memsw_fail_count gauge +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 -ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_memsw_total_bytes Swap total in bytes # TYPE ceems_compute_unit_memsw_total_bytes gauge +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 9.223372036854772e+18 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 9.223372036854772e+18 ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 9.223372036854772e+18 -ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 9.223372036854772e+18 ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 9.223372036854772e+18 -ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 9.223372036854772e+18 # HELP ceems_compute_unit_memsw_used_bytes Swap used in bytes # TYPE ceems_compute_unit_memsw_used_bytes gauge +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 4.032512e+07 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 4.032512e+07 ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.032512e+07 -ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.032512e+07 ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.032512e+07 -ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.032512e+07 # HELP ceems_compute_units Total number of jobs # TYPE ceems_compute_units gauge ceems_compute_units{hostname="",manager="libvirt"} 4 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-memory-subsystem-output.txt 
b/pkg/collector/testdata/output/e2e-test-cgroupsv1-memory-subsystem-output.txt index f8a7740e..40279c9b 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv1-memory-subsystem-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv1-memory-subsystem-output.txt @@ -13,12 +13,12 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009248"} 0 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009249"} 0 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009250"} 0 -# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates the job using current GPU # TYPE ceems_compute_unit_gpu_index_flag gauge -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3/",hindex="/gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1",hindex="/gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 0.6 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/5",hindex="/gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 0.2 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e/",hindex="/gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009248"} 2.1086208e+07 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt index f8a7740e..40279c9b 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt @@ -13,12 +13,12 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009248"} 0 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009249"} 0 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009250"} 0 -# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates the job using current GPU # TYPE ceems_compute_unit_gpu_index_flag gauge -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 
-ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3/",hindex="/gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1",hindex="/gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 0.6 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/5",hindex="/gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 0.2 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e/",hindex="/gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009248"} 2.1086208e+07 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt index 45b09de5..c477fcdf 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt @@ -18,12 +18,12 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009248"} 2 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009249"} 2 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009250"} 2 -# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates the job using current GPU # TYPE ceems_compute_unit_gpu_index_flag gauge -ceems_compute_unit_gpu_index_flag{gpuuuid="20170000800c",hindex="-gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="20170003580c",hindex="-gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="20180003050c",hindex="-gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="20170000800c/",hindex="/gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="20170003580c/",hindex="/gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="20170005280c/",hindex="/gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="20180003050c/",hindex="/gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009248"} 0 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt index 73dd8670..24784878 100644 --- 
a/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt @@ -13,12 +13,12 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009248"} 2 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009249"} 2 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009250"} 2 -# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates the job using current GPU # TYPE ceems_compute_unit_gpu_index_flag gauge -ceems_compute_unit_gpu_index_flag{gpuuuid="20170000800c",hindex="-gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="20170003580c",hindex="-gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="20180003050c",hindex="-gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="20170000800c/",hindex="/gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="20170003580c/",hindex="/gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="20170005280c/",hindex="/gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="20180003050c/",hindex="/gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009248"} 0 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt index 2e5e3f79..238b0b4b 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt @@ -1,110 +1,113 @@ # HELP ceems_compute_unit_blkio_read_total_bytes Total block IO read bytes # TYPE ceems_compute_unit_blkio_read_total_bytes gauge +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 3.0206976e+07 +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 3.0206976e+07 ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 3.0206976e+07 -ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 3.0206976e+07 ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 3.0206976e+07 -ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 3.0206976e+07 # HELP ceems_compute_unit_blkio_read_total_requests Total block IO read requests # TYPE ceems_compute_unit_blkio_read_total_requests gauge 
+ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 1141 +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 1141 ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1141 -ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 1141 ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 1141 -ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1141 # HELP ceems_compute_unit_blkio_write_total_bytes Total block IO write bytes # TYPE ceems_compute_unit_blkio_write_total_bytes gauge +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 1.00337664e+09 +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 1.00337664e+09 ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1.00337664e+09 -ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 1.00337664e+09 ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 1.00337664e+09 -ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1.00337664e+09 # HELP ceems_compute_unit_blkio_write_total_requests Total block IO write requests # TYPE ceems_compute_unit_blkio_write_total_requests gauge +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 14997 +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 14997 ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 14997 -ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 14997 ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 14997 -ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 14997 # HELP ceems_compute_unit_cpu_psi_seconds Total CPU PSI in seconds # TYPE ceems_compute_unit_cpu_psi_seconds gauge +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 
-ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds # TYPE ceems_compute_unit_cpu_system_seconds_total counter +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 115.777502 ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 115.777502 -ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 115.777502 ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 115.777502 -ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 115.777502 # HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds # TYPE ceems_compute_unit_cpu_user_seconds_total counter +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 60375.292848 ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 60375.292848 -ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 60375.292848 ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 60375.292848 -ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 60375.292848 # HELP ceems_compute_unit_cpus Total number of job CPUs # TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 2 +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 2 ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 2 -ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 2 ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 2 -ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 2 -# HELP ceems_compute_unit_gpu_index_flag Indicates running instance on GPU, 1=instance running +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates running instance using current GPU # TYPE ceems_compute_unit_gpu_index_flag gauge -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 
+ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3/",hindex="/gpu-9",hostname="",index="9",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3/",hindex="/gpu-1",hostname="",index="1",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3/",hindex="/gpu-8",hostname="",index="8",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-6cc98505-fdde-461e-a93c-6935fba45a27/",hindex="/gpu-11",hostname="",index="11",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/5",hindex="/gpu-3",hostname="",index="3",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0.1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e/",hindex="/gpu-0",hostname="",index="0",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0.5 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 -ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_memory_fail_count Memory fail count # TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 -ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_memory_psi_seconds Total memory PSI in seconds # TYPE ceems_compute_unit_memory_psi_seconds gauge +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 
-ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes # TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 4.098592768e+09 ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.098592768e+09 -ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.098592768e+09 ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.098592768e+09 -ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.098592768e+09 # HELP ceems_compute_unit_memory_total_bytes Memory total in bytes # TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 4.294967296e+09 ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.294967296e+09 -ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.294967296e+09 ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.294967296e+09 -ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.294967296e+09 # HELP ceems_compute_unit_memory_used_bytes Memory used in bytes # TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 4.111491072e+09 ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.111491072e+09 -ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.111491072e+09 ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.111491072e+09 -ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.111491072e+09 # HELP ceems_compute_unit_memsw_fail_count Swap fail count # TYPE ceems_compute_unit_memsw_fail_count gauge +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 
-ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_unit_memsw_total_bytes Swap total in bytes # TYPE ceems_compute_unit_memsw_total_bytes gauge +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 1.6042172416e+10 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 1.6042172416e+10 ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1.6042172416e+10 -ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 1.6042172416e+10 ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 1.6042172416e+10 -ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1.6042172416e+10 # HELP ceems_compute_unit_memsw_used_bytes Swap used in bytes # TYPE ceems_compute_unit_memsw_used_bytes gauge +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="2896bdd5-dbc2-4339-9d8e-ddd838bf35d3"} 0 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="4de89c5b-50d7-4d30-a630-14e135380fe8"} 0 ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 -ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 -ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 # HELP ceems_compute_units Total number of jobs # TYPE ceems_compute_units gauge ceems_compute_units{hostname="",manager="libvirt"} 4 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt new file mode 100644 index 00000000..e2fa85c8 --- /dev/null +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt @@ -0,0 +1,167 @@ +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total counter +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",uuid="1009248"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",uuid="1009249"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",uuid="1009250"} 115.777502 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total counter +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009248"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009249"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009250"} 60375.292848 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009248"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009249"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009250"} 2 +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates the job using current GPU +# TYPE 
ceems_compute_unit_gpu_index_flag gauge +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3/",hindex="/gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3/",hindex="/gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3/",hindex="/gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e/",hindex="/gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009248"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009249"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009250"} 0 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",uuid="1009248"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",uuid="1009249"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",uuid="1009250"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",uuid="1009248"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",uuid="1009249"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",uuid="1009250"} 4.098592768e+09 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",uuid="1009248"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",uuid="1009249"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",uuid="1009250"} 4.294967296e+09 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",uuid="1009248"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",uuid="1009249"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",uuid="1009250"} 4.111491072e+09 +# HELP ceems_compute_unit_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_compute_unit_rdma_hca_handles gauge +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",uuid="1009249"} 479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",uuid="1009250"} 289 +ceems_compute_unit_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",uuid="1009249"} 1479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",uuid="1009249"} 2479 +# HELP ceems_compute_unit_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_compute_unit_rdma_hca_objects gauge +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",uuid="1009249"} 479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",uuid="1009250"} 289 
+ceems_compute_unit_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",uuid="1009249"} 1479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",uuid="1009249"} 2479 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="slurm"} 3 +# HELP ceems_cpu_count Number of CPUs. +# TYPE ceems_cpu_count gauge +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 +# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. +# TYPE ceems_cpu_seconds_total counter +ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 +ceems_cpu_seconds_total{hostname="",mode="iowait"} 35.52 +ceems_cpu_seconds_total{hostname="",mode="irq"} 0.02 +ceems_cpu_seconds_total{hostname="",mode="nice"} 6.12 +ceems_cpu_seconds_total{hostname="",mode="softirq"} 39.44 +ceems_cpu_seconds_total{hostname="",mode="steal"} 0 +ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 +ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 +# HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. +# TYPE ceems_exporter_build_info gauge +# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts +# TYPE ceems_ipmi_dcmi_avg_watts gauge +ceems_ipmi_dcmi_avg_watts{hostname=""} 49 +# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts +# TYPE ceems_ipmi_dcmi_current_watts gauge +ceems_ipmi_dcmi_current_watts{hostname=""} 49 +# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts +# TYPE ceems_ipmi_dcmi_max_watts gauge +ceems_ipmi_dcmi_max_watts{hostname=""} 304 +# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts +# TYPE ceems_ipmi_dcmi_min_watts gauge +ceems_ipmi_dcmi_min_watts{hostname=""} 6 +# HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. +# TYPE ceems_meminfo_MemAvailable_bytes gauge +ceems_meminfo_MemAvailable_bytes{hostname=""} 0 +# HELP ceems_meminfo_MemFree_bytes Memory information field MemFree_bytes. +# TYPE ceems_meminfo_MemFree_bytes gauge +ceems_meminfo_MemFree_bytes{hostname=""} 4.50891776e+08 +# HELP ceems_meminfo_MemTotal_bytes Memory information field MemTotal_bytes. 
+# TYPE ceems_meminfo_MemTotal_bytes gauge +ceems_meminfo_MemTotal_bytes{hostname=""} 1.6042172416e+10 +# HELP ceems_rapl_package_joules_total Current RAPL package value in joules +# TYPE ceems_rapl_package_joules_total counter +ceems_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 258218.293244 +ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 130570.505826 +# HELP ceems_rapl_package_power_limit_watts_total Current RAPL package power limit in watts +# TYPE ceems_rapl_package_power_limit_watts_total counter +ceems_rapl_package_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 180 +ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 180 +# HELP ceems_rdma_cqe_len_active Length of active CQs +# TYPE ceems_rdma_cqe_len_active gauge +ceems_rdma_cqe_len_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="1320003"} 8190 +ceems_rdma_cqe_len_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="4824887"} 8190 +# HELP ceems_rdma_cqs_active Number of active CQs +# TYPE ceems_rdma_cqs_active gauge +ceems_rdma_cqs_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="1320003"} 2 +ceems_rdma_cqs_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="4824887"} 2 +# HELP ceems_rdma_mrs_active Number of active MRs +# TYPE ceems_rdma_mrs_active gauge +ceems_rdma_mrs_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="1320003"} 2 +ceems_rdma_mrs_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="4824887"} 2 +# HELP ceems_rdma_mrs_len_active Length of active MRs +# TYPE ceems_rdma_mrs_len_active gauge +ceems_rdma_mrs_len_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="1320003"} 4.194304e+06 +ceems_rdma_mrs_len_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="4824887"} 4.194304e+06 +# HELP ceems_rdma_port_data_received_bytes_total Number of data octets received on all links +# TYPE ceems_rdma_port_data_received_bytes_total counter +ceems_rdma_port_data_received_bytes_total{device="hfi1_0",hostname="",manager="slurm",port="1"} 1.380366808104e+12 +ceems_rdma_port_data_received_bytes_total{device="mlx4_0",hostname="",manager="slurm",port="1"} 8.884894436e+09 +ceems_rdma_port_data_received_bytes_total{device="mlx4_0",hostname="",manager="slurm",port="2"} 9.841747136e+09 +ceems_rdma_port_data_received_bytes_total{device="mlx5_0",hostname="",manager="slurm",port="1"} 7.2505381512e+10 +# HELP ceems_rdma_port_data_transmitted_bytes_total Number of data octets transmitted on all links +# TYPE ceems_rdma_port_data_transmitted_bytes_total counter +ceems_rdma_port_data_transmitted_bytes_total{device="hfi1_0",hostname="",manager="slurm",port="1"} 1.094233306172e+12 +ceems_rdma_port_data_transmitted_bytes_total{device="mlx4_0",hostname="",manager="slurm",port="1"} 1.0603645318e+11 +ceems_rdma_port_data_transmitted_bytes_total{device="mlx4_0",hostname="",manager="slurm",port="2"} 1.0616142756e+11 +ceems_rdma_port_data_transmitted_bytes_total{device="mlx5_0",hostname="",manager="slurm",port="1"} 1.1523046035392e+13 +# HELP ceems_rdma_port_packets_received_total Number of packets received on all VLs by this port (including errors) +# TYPE ceems_rdma_port_packets_received_total counter 
+ceems_rdma_port_packets_received_total{device="hfi1_0",hostname="",manager="slurm",port="1"} 6.38036947e+08 +ceems_rdma_port_packets_received_total{device="mlx4_0",hostname="",manager="slurm",port="1"} 8.7169372e+07 +ceems_rdma_port_packets_received_total{device="mlx4_0",hostname="",manager="slurm",port="2"} 8.9332064e+07 +ceems_rdma_port_packets_received_total{device="mlx5_0",hostname="",manager="slurm",port="1"} 5.41889824e+08 +# HELP ceems_rdma_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors) +# TYPE ceems_rdma_port_packets_transmitted_total counter +ceems_rdma_port_packets_transmitted_total{device="hfi1_0",hostname="",manager="slurm",port="1"} 5.68318856e+08 +ceems_rdma_port_packets_transmitted_total{device="mlx4_0",hostname="",manager="slurm",port="1"} 8.5734114e+07 +ceems_rdma_port_packets_transmitted_total{device="mlx4_0",hostname="",manager="slurm",port="2"} 8.862285e+07 +ceems_rdma_port_packets_transmitted_total{device="mlx5_0",hostname="",manager="slurm",port="1"} 1.0907922116e+10 +# HELP ceems_rdma_qps_active Number of active QPs +# TYPE ceems_rdma_qps_active gauge +ceems_rdma_qps_active{device="mlx5_0",hostname="",manager="slurm",port="1",uuid="1320003"} 16 +ceems_rdma_qps_active{device="mlx5_0",hostname="",manager="slurm",port="1",uuid="4824887"} 16 +# HELP ceems_rdma_state_id State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer) +# TYPE ceems_rdma_state_id gauge +ceems_rdma_state_id{device="hfi1_0",hostname="",manager="slurm",port="1"} 4 +ceems_rdma_state_id{device="mlx4_0",hostname="",manager="slurm",port="1"} 4 +ceems_rdma_state_id{device="mlx4_0",hostname="",manager="slurm",port="2"} 4 +ceems_rdma_state_id{device="mlx5_0",hostname="",manager="slurm",port="1"} 4 +# HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. +# TYPE ceems_scrape_collector_duration_seconds gauge +# HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. 
+# TYPE ceems_scrape_collector_success gauge +ceems_scrape_collector_success{collector="cpu"} 1 +ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="meminfo"} 1 +ceems_scrape_collector_success{collector="rapl"} 1 +ceems_scrape_collector_success{collector="slurm"} 1 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt index 8b96c785..d10c1e07 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt @@ -13,12 +13,12 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009248"} 2 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009249"} 2 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009250"} 2 -# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates the job using current GPU # TYPE ceems_compute_unit_gpu_index_flag gauge -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3/",hindex="/gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1",hindex="/gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 0.6 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/5",hindex="/gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 0.2 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e/",hindex="/gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009248"} 0 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt index 4bc7b85b..b2be1a7c 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt @@ -13,12 +13,12 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",uuid="1009 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009248"} 2 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009249"} 2 ceems_compute_unit_cpus{hostname="",manager="slurm",uuid="1009250"} 2 -# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# HELP ceems_compute_unit_gpu_index_flag A value > 0 indicates the job using current GPU # TYPE ceems_compute_unit_gpu_index_flag gauge 
-ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 1 -ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3/",hindex="/gpu-1",hostname="",index="1",manager="slurm",uuid="1009250"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1",hindex="/gpu-2",hostname="",index="2",manager="slurm",uuid="1009248"} 0.6 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-956348bc-d43d-23ed-53d4-857749fa2b67/5",hindex="/gpu-3",hostname="",index="3",manager="slurm",uuid="1009248"} 0.2 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e/",hindex="/gpu-0",hostname="",index="0",manager="slurm",uuid="1009249"} 1 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",uuid="1009248"} 0 diff --git a/pkg/collector/testdata/qemu/instance-00000002.xml b/pkg/collector/testdata/qemu/instance-00000002.xml index 6aecd171..488ef9e8 100644 --- a/pkg/collector/testdata/qemu/instance-00000002.xml +++ b/pkg/collector/testdata/qemu/instance-00000002.xml @@ -117,15 +117,13 @@ or other application using the libvirt API.
diff --git a/pkg/collector/testdata/qemu/instance-00000003.xml b/pkg/collector/testdata/qemu/instance-00000003.xml index 5237d62f..ad8a6cd9 100644 --- a/pkg/collector/testdata/qemu/instance-00000003.xml +++ b/pkg/collector/testdata/qemu/instance-00000003.xml @@ -7,7 +7,7 @@ or other application using the libvirt API. instance-00000003 - bf9ccd0f-4cd7-4ea2-8855-b56467326f61 + 2896bdd5-dbc2-4339-9d8e-ddd838bf35d3 @@ -116,9 +116,13 @@ or other application using the libvirt API.
diff --git a/pkg/collector/testdata/qemu/instance-00000004.xml b/pkg/collector/testdata/qemu/instance-00000004.xml index 84b4bde6..66434bb8 100644 --- a/pkg/collector/testdata/qemu/instance-00000004.xml +++ b/pkg/collector/testdata/qemu/instance-00000004.xml @@ -7,7 +7,7 @@ or other application using the libvirt API. instance-00000004 - 5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e + 4de89c5b-50d7-4d30-a630-14e135380fe8 diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index 60fe3418..967889d2 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -61,7 +61,12 @@ then cgroups_mode="unified" desc="Cgroups V2 with nVIDIA GPU and ipmiutil" fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt' - elif [ "${scenario}" = "exporter-cgroups-v2-amd-ipmitool" ] + elif [ "${scenario}" = "exporter-cgroups-v2-nvidia-gpu-reordering" ] + then + cgroups_mode="unified" + desc="Cgroups V2 with nVIDIA GPU reordering" + fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt' + elif [ "${scenario}" = "exporter-cgroups-v2-amd-ipmitool" ] then cgroups_mode="unified" desc="Cgroups V2 with AMD GPU and ipmitool" @@ -367,6 +372,26 @@ then --web.disable-exporter-metrics \ --log.level="debug" > "${logfile}" 2>&1 & + elif [ "${scenario}" = "exporter-cgroups-v2-nvidia-gpu-reordering" ] + then + PATH="${PWD}/pkg/collector/testdata/ipmi/ipmiutils:${PATH}" ./bin/ceems_exporter \ + --path.sysfs="pkg/collector/testdata/sys" \ + --path.cgroupfs="pkg/collector/testdata/sys/fs/cgroup" \ + --path.procfs="pkg/collector/testdata/proc" \ + --collector.cgroups.force-version="v2" \ + --collector.slurm \ + --collector.slurm.gpu-order-map="0:0,1:1,2:4,3:5,4:2.1,5:2.5,6:2.13,7:3.1,8:3.5,9:3.13,10:6,11:7" \ + --collector.gpu.type="nvidia" \ + --collector.gpu.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ + --collector.slurm.gpu-job-map-path="pkg/collector/testdata/gpujobmap" \ + --collector.rdma.stats \ + --collector.rdma.cmd="pkg/collector/testdata/rdma" \ + --collector.empty-hostname-label \ + --collector.ipmi_dcmi.test-mode \ + --web.listen-address "127.0.0.1:${port}" \ + --web.disable-exporter-metrics \ + --log.level="debug" > "${logfile}" 2>&1 & + elif [ "${scenario}" = "exporter-cgroups-v2-amd-ipmitool" ] then PATH="${PWD}/pkg/collector/testdata/ipmi/openipmi:${PATH}" ./bin/ceems_exporter \ diff --git a/scripts/install_clang.sh b/scripts/install_clang.sh index d13c1aa8..0c0e7638 100755 --- a/scripts/install_clang.sh +++ b/scripts/install_clang.sh @@ -4,22 +4,29 @@ set -exo pipefail # This script only works for Ubuntu derivates and it is meant to be # used in CI to install clang in golang builder containers. +create_symlinks() { + echo "Creating symlinks" + $SUDO ln -vsnf /usr/lib/llvm-18/bin/clang /usr/bin/clang + $SUDO ln -vsnf /usr/lib/llvm-18/bin/llc /usr/bin/llc +} + +# Setup sudo prefix +SUDO='' +if (( $EUID != 0 )); then + SUDO='sudo' +fi + # Check if clang exists. If it exists, we need to ensure that it # is at least of version >= 18 if [ -x "$(command -v clang)" ]; then clang_major_version=$(clang -v 2>&1 | grep version | grep -o "[0-9]\+\.[0-9]\+\.[0-9]\+" | cut -d "." -f1) if (( ${clang_major_version} >= 18 )); then echo "clang >=18 already installed. Skipping installation...." 
+ create_symlinks exit 0 fi fi -# Setup sudo prefix -SUDO='' -if (( $EUID != 0 )); then - SUDO='sudo' -fi - # Install clang stable version dependencies $SUDO apt-get update && $SUDO apt-get install -y --no-install-recommends \ wget lsb-release wget software-properties-common gnupg \ @@ -29,5 +36,4 @@ $SUDO apt-get update && $SUDO apt-get install -y --no-install-recommends \ $SUDO bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" # Create necessary symlinks -$SUDO ln -vsnf /usr/lib/llvm-18/bin/clang /usr/bin/clang -$SUDO ln -vsnf /usr/lib/llvm-18/bin/llc /usr/bin/llc +create_symlinks diff --git a/website/docs/01-philisophy.md b/website/docs/01-philisophy.md index 00c45279..542dc871 100644 --- a/website/docs/01-philisophy.md +++ b/website/docs/01-philisophy.md @@ -80,6 +80,12 @@ current exporter takes care of the GPU index to compute unit mapping. These two can be used together using PromQL to show the metrics of GPU metrics of a given compute unit. +In the case of vGPUs supported by NVIDIA Grid, the energy consumed by each vGPU is +estimated using the total energy consumption of the physical GPU and the number of active +vGPUs scheduled on that physical GPU. Similarly, in the case of Multi-Instance GPU (MIG), +the energy consumption of each MIG instance is estimated based on the relative number +of Streaming Multiprocessors (SMs) and the total energy consumption of the physical GPU. + ## Performance metrics Presenting energy and emission metrics is only one side of the story. This will @@ -93,5 +99,6 @@ energy efficient. Currently, CEEMS provides performance metrics for CPU. It is possible to gather performance metrics for nVIDIA GPUs as well as long as operators install and enable -nVIDIA DCGM libraries. More details can be found in [DCGM](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#profiling-metrics) +nVIDIA DCGM libraries. More details can be found in +[DCGM](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#profiling-metrics) docs. diff --git a/website/docs/advanced/multi-cluster.md b/website/docs/advanced/multi-cluster.md index 73f2c538..5ced72d9 100644 --- a/website/docs/advanced/multi-cluster.md +++ b/website/docs/advanced/multi-cluster.md @@ -1,5 +1,5 @@ --- -sidebar_position: 1 +sidebar_position: 2 --- # Multi Cluster Scenarios diff --git a/website/docs/advanced/power-consumption.md b/website/docs/advanced/power-consumption.md new file mode 100644 index 00000000..8446783e --- /dev/null +++ b/website/docs/advanced/power-consumption.md @@ -0,0 +1,93 @@ +--- +sidebar_position: 1 +--- + +# Energy consumption estimation + +## CPU + +TODO + +## GPU + +CEEMS leverages [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) and +[amd-smi-exporter](https://github.com/amd/amd_smi_exporter) to get the power +consumption of GPUs. When the resource manager uses a full physical GPU, estimating +the power consumption of each compute unit is straightforward, as the CEEMS exporter +already exports a metric that maps the compute unit ID to the GPU ordinal. However, +NVIDIA GPUs support sharing one physical GPU amongst different compute units +using Multi-Instance GPU (MIG) and GRID vGPU strategies. Currently, `dcgm-exporter` +does not estimate the power consumption of each MIG instance or vGPU. Thus, CEEMS uses +the following approximations to estimate the power consumption of shared GPU instances. + +### MIG + +The CEEMS exporter uses an approximation based on the number of +Streaming Multiprocessors (SMs) in each +MIG instance profile.
For instance, in a typical A100 40GB card, a full GPU can be +split into the following profiles: + +```bash +$ nvidia-smi mig -lgi ++----------------------------------------------------+ +| GPU instances: | +| GPU Name Profile Instance Placement | +| ID ID Start:Size | +|====================================================| +| 0 MIG 1g.5gb 19 13 6:1 | ++----------------------------------------------------+ +| 0 MIG 2g.10gb 14 5 4:2 | ++----------------------------------------------------+ +| 0 MIG 4g.20gb 5 1 0:4 | ++----------------------------------------------------+ +``` + +This means MIG instance `4g.20gb` has 4/7 of the SMs, `2g.10gb` has 2/7 of the SMs and `1g.5gb` has +1/7 of the SMs. Consequently, the power consumed by the entire GPU is divided amongst the different +MIG instances in the ratio of their SM counts. For example, if the physical GPU's +power consumption is 140 W, the power consumption of each MIG profile will be estimated as +follows: + +- `1g.5gb`: 140 * (1/7) = 20 W +- `2g.10gb`: 140 * (2/7) = 40 W +- `4g.20gb`: 140 * (4/7) = 80 W + +The exporter will export the coefficient for each MIG instance, which can be used along with +the power consumption metric of `dcgm-exporter` to estimate the power consumption of individual MIG +instances. + +### vGPU + +In the case of libvirt, besides MIG, NVIDIA also supports GRID vGPU time sharing. The following scenarios +are possible when GPUs are present on the compute node: + +- PCI pass through of NVIDIA and AMD GPUs to the guest VMs +- Virtualization of full GPUs using NVIDIA Grid vGPU +- Virtualization of MIG GPUs using NVIDIA Grid vGPU + +If a GPU is added to a VM using PCI pass through, this GPU will not be available +to the hypervisor and hence, it cannot be queried or monitored. This is due to +the fact that the GPU will be unbound from the hypervisor and bound to the guest. +Thus, energy consumption and GPU metrics for GPUs using PCI passthrough +**will only be available in the guest**. + +NVIDIA's vGPU uses mediated devices to expose GPUs in the guest and thus, +GPUs can be queried and monitored from both the hypervisor and the guest. However, +CEEMS relies on [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) to +export GPU energy consumption and usage metrics, and it does not support +usage and energy consumption metrics for vGPUs. Thus, the CEEMS exporter uses +the following approximation method to estimate the energy consumption of each +vGPU, which in turn gives the energy consumption of each guest VM. + +NVIDIA Grid vGPU time slicing divides the GPU resources equally among all the +active vGPUs at any given time and schedules the work on the given physical +GPU. Thus, if there are 4 vGPUs active on a given physical GPU, each vGPU +will get 25% of the full GPU compute power. Hence, a reasonable approximation +would be to split the current physical GPU power consumption equally among all +vGPUs. The same applies when using vGPU on top of a MIG partition. MIG +already divides the physical GPU into different profiles by assigning a given +number of SMs to each profile, as discussed in the MIG section above. When +multiple vGPUs are running on top of a MIG instance, this coefficient is +further divided by the number of active vGPUs. For instance, if there are 4 vGPUs +scheduled on a MIG profile `4g.20gb` where the physical GPU is consuming +140 W, the power consumption of each vGPU would be 140*(4/7)*(1/4) = 20 W.
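The arithmetic described above can be condensed into a short sketch. This is only an illustration of the approximation, not the exporter's actual implementation; the function names are hypothetical and the figures (7 SM slices, 140 W) come from the A100 example above.

```go
package main

import "fmt"

// migCoefficient returns the fraction of the physical GPU attributed to a
// MIG instance, based on its share of SMs (e.g. 4/7 for 4g.20gb on an A100
// with 7 SM slices in total).
func migCoefficient(instanceSMs, totalSMs float64) float64 {
	return instanceSMs / totalSMs
}

// vgpuPower estimates the power of one vGPU: the SM-based coefficient of the
// underlying (possibly MIG-partitioned) GPU is divided equally among the
// active vGPUs scheduled on it. For a full, non-partitioned GPU the
// coefficient is simply 1.
func vgpuPower(physicalWatts, coeff float64, activeVGPUs int) float64 {
	return physicalWatts * coeff / float64(activeVGPUs)
}

func main() {
	fmt.Println(vgpuPower(140, migCoefficient(1, 7), 1)) // 1g.5gb MIG instance: 20 W
	fmt.Println(vgpuPower(140, migCoefficient(4, 7), 1)) // 4g.20gb MIG instance: 80 W
	fmt.Println(vgpuPower(140, migCoefficient(4, 7), 4)) // 4 vGPUs on 4g.20gb: 20 W each
}
```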
diff --git a/website/docs/components/ceems-exporter.md b/website/docs/components/ceems-exporter.md index a8272551..f39329df 100644 --- a/website/docs/components/ceems-exporter.md +++ b/website/docs/components/ceems-exporter.md @@ -255,16 +255,9 @@ of RDMA very well. Slurm collector exports the job related metrics like usage of CPU, DRAM, RDMA, _etc_. This is done by walking through the cgroups created by SLURM daemon on compute node on every scrape request. As walking through the cgroups pseudo file system is _very cheap_, -this will zero zero to negligible impact on the actual job. - -The exporter has been heavily inspired by -[cgroups_exporter](https://github.com/treydock/cgroup_exporter) and it supports both -cgroups **v1** and **v2**. For jobs with GPUs, we must the GPU ordinals allocated to -each job so that we can match GPU metrics scrapped by either -[dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) or -[amd-smi-exporter](https://github.com/amd/amd_smi_exporter) to jobs. Unfortunately, -this information is not available post-mortem of the job and hence, we need to export -the mapping related to job ID to GPU ordinals. +this will have zero to negligible impact on the actual job. The exporter has been +heavily inspired by [cgroups_exporter](https://github.com/treydock/cgroup_exporter) +and it supports both cgroups **v1** and **v2**. :::warning[WARNING] @@ -279,6 +272,13 @@ be found in [Configuration](../configuration/resource-managers.md) section. ::: +For jobs with GPUs, we must know the GPU ordinals allocated to +each job so that we can match GPU metrics scraped by either +[dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) or +[amd-smi-exporter](https://github.com/amd/amd_smi_exporter) to jobs. Unfortunately, +this information is not available post-mortem of the job and hence, the CEEMS exporter +exports a metric that maps the job ID to GPU ordinals. + Currently, the list of job related metrics exported by SLURM exporter are as follows: - Job current CPU time in user and system mode @@ -316,20 +316,7 @@ from cgroups. The collector supports both cgroups v1 and v2. When GPUs are present on the compute node, like in the case of Slurm, we will need information on which GPU is used by which VM. This information can be -obtained in libvirt's XML file that keeps the state of the VM. However, there -are few caveats here: - -- If a GPU is added to VM using PCI pass through, this GPU will not be available -for the hypervisor and hence, it cannot be queried or monitored. This is due to -the fact that the GPU will be unbound from the hypervisor and bound to guest. -Thus, energy consumption and GPU metrics for GPUs using PCI passthrough -**will only be available in the guest**. - -- NVIDIA's vGPU uses mediated devices to expose GPUs in the guest and thus, -GPUs can be queried and monitored from both hypervisor and guest. However, -CEEMS rely on [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) to -export GPU energy consumption and usage metrics and it does not support -usage and energy consumption metrics for vGPUs. +obtained from libvirt's XML file that keeps the state of the VM. - NVIDIA's MIG instances uses a similar approach to vGPU to expose GPUs inside guests and hence, similar limitations apply.
diff --git a/website/docs/configuration/ceems-exporter.md b/website/docs/configuration/ceems-exporter.md index cdd73115..4784c1cf 100644 --- a/website/docs/configuration/ceems-exporter.md +++ b/website/docs/configuration/ceems-exporter.md @@ -25,8 +25,7 @@ a consistent styling. They will be removed in `v1.0.0`. Although fetching metrics from cgroups do not need any additional privileges, getting GPU ordinal to job ID needs extra privileges. This is due to the fact that this -information is not readily available in cgroups (at least in v2 where devices are -bound to cgroups using BPF programs). Currently, the exporter supports two different +information is not readily available in cgroups. Currently, the exporter supports two different ways to get the GPU ordinals to job ID map. - Reading environment variables `SLURM_STEP_GPUS` and/or `SLURM_JOB_GPUS` of job from @@ -44,20 +43,6 @@ On the other hand, if the operators do not wish to add any privileges to exporte process, they can use the second approach but this requires some configuration additions to SLURM controller to execute a prolog and epilog script for each job. A sample prolog script to get GPU ordinals is as follows: ```bash @@ -95,6 +80,96 @@ ceems_exporter --collector.slurm --collector.slurm.gpu-job-map-path=/run/gpujobma With above configuration, the exporter should export GPU ordinal mapping along with other metrics of slurm collector. +When compute nodes use a mix of full physical GPUs and MIG instances (NVIDIA), the +ordering of GPUs by SLURM is undefined and it can depend on how the compute nodes are +configured. More details can be found in this +[bug report](https://support.schedmd.com/show_bug.cgi?id=21163). If the ordering of GPUs +does not match between `nvidia-smi` and SLURM, operators need to configure the CEEMS +exporter appropriately to provide the ordering of GPUs as known to SLURM. + +For example, consider a compute node with 2 A100 GPUs where MIG is enabled only on +GPU 0: + +```bash +$ nvidia-smi +Fri Oct 11 12:04:56 2024 ++---------------------------------------------------------------------------------------+ +| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 | +|-----------------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M.
| +|=========================================+======================+======================| +| 0 NVIDIA A100-PCIE-40GB On | 00000000:21:00.0 Off | On | +| N/A 28C P0 31W / 250W | 50MiB / 40960MiB | N/A Default | +| | | Enabled | ++-----------------------------------------+----------------------+----------------------+ +| 1 NVIDIA A100-PCIE-40GB On | 00000000:81:00.0 Off | 0 | +| N/A 27C P0 34W / 250W | 4MiB / 40960MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+----------------------+----------------------+ + ++---------------------------------------------------------------------------------------+ +| MIG devices: | ++------------------+--------------------------------+-----------+-----------------------+ +| GPU GI CI MIG | Memory-Usage | Vol| Shared | +| ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG | +| | | ECC| | +|==================+================================+===========+=======================| +| 0 3 0 0 | 12MiB / 9856MiB | 14 0 | 1 0 1 0 0 | +| | 0MiB / 16383MiB | | | ++------------------+--------------------------------+-----------+-----------------------+ +| 0 4 0 1 | 12MiB / 9856MiB | 14 0 | 1 0 1 0 0 | +| | 0MiB / 16383MiB | | | ++------------------+--------------------------------+-----------+-----------------------+ +| 0 5 0 2 | 12MiB / 9856MiB | 14 0 | 1 0 1 0 0 | +| | 0MiB / 16383MiB | | | ++------------------+--------------------------------+-----------+-----------------------+ +| 0 6 0 3 | 12MiB / 9856MiB | 14 0 | 1 0 1 0 0 | +| | 0MiB / 16383MiB | | | ++------------------+--------------------------------+-----------+-----------------------+ + ++---------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=======================================================================================| +| No running processes found | ++---------------------------------------------------------------------------------------+ +``` + +In this case `nvidia-smi` orders GPUs and MIG instances as follows: + +- 0.3 +- 0.4 +- 0.5 +- 0.6 +- 1 + +where `0.3` indicates GPU 0 and GPU Instance ID (GI ID) 3. However, SLURM can order these +GPUs as follows depending on certain configurations: + +- 1 +- 0.3 +- 0.4 +- 0.5 +- 0.6 + +The difference between the two orderings is that SLURM places the full physical GPU at the top +and then enumerates the MIG instances. Operators can verify SLURM's GPU ordering +by reserving a job and looking at the `SLURM_JOB_GPUS` or `SLURM_STEP_GPUS` environment variables. +If the ordering is different between `nvidia-smi` and SLURM, as demonstrated in this example, +we need to define a map from the SLURM order to the `nvidia-smi` order and pass it to the exporter using +the `--collector.slurm.gpu-order-map` CLI flag. In this case, the map definition would be +`--collector.slurm.gpu-order-map=0:1,1:0.3,2:0.4,3:0.5,4:0.6`. The nomenclature is +`<SLURM GPU index>:<nvidia-smi GPU index>[.<GPU instance ID>]`, with entries delimited by `,`. From +SLURM's point-of-view, GPU 0 is GPU 1 from `nvidia-smi`'s point-of-view and hence the first +element is `0:1`. Similarly, SLURM's GPU 1 is `nvidia-smi`'s GPU `0.3` (GPU 0 with GI ID 3) +and hence the second element is `1:0.3` and so on. As stated above, if the compute node uses +only full GPUs or if all GPUs are MIG partitioned, the order between SLURM and `nvidia-smi` +would be the same. In any case, it is a good idea to ensure that the GPU indexes agree between +SLURM and `nvidia-smi` and to configure the CEEMS exporter appropriately.
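For illustration, here is a minimal Go sketch of how such a map value could be parsed. The entry format is inferred from the examples above, and `parseGPUOrderMap` is a hypothetical helper, not the exporter's actual code:

```go
package main

import (
	"fmt"
	"strings"
)

// parseGPUOrderMap parses a value such as "0:1,1:0.3,2:0.4,3:0.5,4:0.6"
// into a map from the SLURM GPU index to the nvidia-smi device, where
// "0.3" means physical GPU 0 with GPU instance ID 3.
func parseGPUOrderMap(value string) (map[string]string, error) {
	order := make(map[string]string)
	for _, entry := range strings.Split(value, ",") {
		parts := strings.SplitN(entry, ":", 2)
		if len(parts) != 2 {
			return nil, fmt.Errorf("malformed gpu-order-map entry %q", entry)
		}
		order[parts[0]] = parts[1]
	}
	return order, nil
}

func main() {
	order, err := parseGPUOrderMap("0:1,1:0.3,2:0.4,3:0.5,4:0.6")
	if err != nil {
		panic(err)
	}
	// SLURM GPU 1 maps to nvidia-smi GPU 0, GI ID 3.
	fmt.Println(order["1"]) // prints "0.3"
}
```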
+ As discussed in [Components](../components/ceems-exporter.md#slurm-collector), Slurm collector supports [perf](../components/ceems-exporter.md#perf-sub-collector) and [eBPF](../components/ceems-exporter.md#ebpf-sub-collector) sub-collectors. These